diff --git a/.gitattributes b/.gitattributes index 15b248ac78dca2037cfa5695901668e766efafb8..7adb6e5f5a29449fec73e6ef2641086c28775b74 100644 --- a/.gitattributes +++ b/.gitattributes @@ -34,3 +34,278 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text OpenVid1M/video_reorg/OpenVid1M_reorganized.csv filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/Infinitystar_image_gen_benchmark.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/Infinitystar_videogen_benchmark.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/Infinitystar_videogen_humaneval.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/framework.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/i2v_examples.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/logo.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/reference_image.webp filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/supp_show_images.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/assets/v2v_examples.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_0.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_1.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_2.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_3.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_0.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_1.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_2.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_3.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/vae_reconstruction_test/comparison.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/vae_reconstruction_test/comparison_grid.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/vae_reconstruction_test/frame_000_comparison.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/vae_reconstruction_test/frame_001_comparison.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/vae_reconstruction_test/frame_002_comparison.png filter=lfs diff=lfs merge=lfs -text 
+Meissonic/InfinityStar/vae_reconstruction_test/frame_003_comparison.png filter=lfs diff=lfs merge=lfs -text +Meissonic/InfinityStar/vae_reconstruction_test/frame_004_comparison.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/assets/example.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/assets/radar.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/assets/vidtwin.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/assets/vidtwin_demo.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_cache/VidTok/assets/example.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_cache/VidTok/assets/radar.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin_demo.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_test_output/comparison_grid_video_0.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_test_output/comparison_grid_video_1.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_test_output/comparison_grid_video_2.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_test_output/comparison_grid_video_3.png filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_test_output/comparison_video_0.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_test_output/comparison_video_1.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_test_output/comparison_video_2.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/VidTok/vidtok_test_output/comparison_video_3.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/assets/architecture.png filter=lfs diff=lfs merge=lfs -text +Meissonic/assets/demos.pdf filter=lfs diff=lfs merge=lfs -text +Meissonic/assets/demos.png filter=lfs diff=lfs merge=lfs -text +Meissonic/assets/inpaint/0eKR4M2uuL8.jpg filter=lfs diff=lfs merge=lfs -text +Meissonic/assets/inpaint/__Owak0IgJk.jpg filter=lfs diff=lfs merge=lfs -text +Meissonic/assets/outpaint/__G2yFuW7jQ.jpg filter=lfs diff=lfs merge=lfs -text +Meissonic/cosmos_test_output/comparison_grid_video_0.png filter=lfs diff=lfs merge=lfs -text +Meissonic/cosmos_test_output/comparison_grid_video_1.png filter=lfs diff=lfs merge=lfs -text +Meissonic/cosmos_test_output/comparison_grid_video_2.png filter=lfs diff=lfs merge=lfs -text +Meissonic/cosmos_test_output/comparison_grid_video_3.png filter=lfs diff=lfs merge=lfs -text +Meissonic/cosmos_test_output/comparison_video_1.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/cosmos_test_output/comparison_video_2.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/cosmos_test_output/comparison_video_3.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/output/9_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/9_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/A[[:space:]]black[[:space:]]an_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/A[[:space:]]cat[[:space:]]wear_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/A[[:space:]]dog[[:space:]]in[[:space:]]a_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/A[[:space:]]large[[:space:]]bo_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/A[[:space:]]robot[[:space:]]pl_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/A[[:space:]]white[[:space:]]an_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text 
+Meissonic/output/The[[:space:]]sun[[:space:]]is_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/Three[[:space:]]boat_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output/Two[[:space:]]actors_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/1499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_180x320_16f_2bs_4\*8\*8vqvae_0_2_ratio/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_180x320_16f_2bs_4\*8\*8vqvae_0_2_ratio/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/1499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/1999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/1999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/2499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/2499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/2999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/2999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/3499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/3499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/3999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/3999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/999_video_0_CFG-9.png filter=lfs 
diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/1499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/1999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/1999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/2499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/2499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/2999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/2999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/3499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/3499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/1499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/1999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/1999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/2499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/2499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/2999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/2999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/3499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/3499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/3999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/3999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/4499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/4499_video_1_CFG-9.png filter=lfs 
diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/4999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/4999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/5499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/5499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/5999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/5999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/6499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/6499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/6999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/6999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/7499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/7499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/7999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/7999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/8499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/8999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/8999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/9499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/9999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/9999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/autoencoder.jit filter=lfs diff=lfs merge=lfs -text +Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/decoder.jit filter=lfs diff=lfs merge=lfs -text +Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/encoder.jit filter=lfs diff=lfs merge=lfs -text +Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/autoencoder.jit filter=lfs diff=lfs merge=lfs -text +Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/decoder.jit filter=lfs diff=lfs merge=lfs -text +Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/encoder.jit filter=lfs diff=lfs merge=lfs -text +Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/autoencoder.jit filter=lfs diff=lfs merge=lfs -text 
+Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/decoder.jit filter=lfs diff=lfs merge=lfs -text +Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/encoder.jit filter=lfs diff=lfs merge=lfs -text +Meissonic/vidtok_cache/VidTok/assets/example.mp4 filter=lfs diff=lfs merge=lfs -text +Meissonic/vidtok_cache/VidTok/assets/radar.png filter=lfs diff=lfs merge=lfs -text +Meissonic/vidtok_cache/VidTok/assets/vidtwin.png filter=lfs diff=lfs merge=lfs -text +Meissonic/vidtok_cache/VidTok/assets/vidtwin_demo.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_092554-l16v7o9l/run-l16v7o9l.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_094329-qf4q6gjw/run-qf4q6gjw.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_094715-uvgb9hvt/run-uvgb9hvt.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_102454-nnww5mz8/run-nnww5mz8.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_111518-slrbepi0/run-slrbepi0.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_113103-ijl2gw6b/run-ijl2gw6b.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_2f39bee6c4969d94f6d2.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_a0ddb52b457bceac4774.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_1fc345a8cdc18e62468b.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_f4b36308698e96e11163.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_0798147230daa742b054.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_aed08910c4a8dcdc87f6.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_114426-5sh31nrg/run-5sh31nrg.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251207_162442-54o4hegd/run-54o4hegd.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_032955-tl61pd0t/run-tl61pd0t.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_040606-2dcjc9k8/run-2dcjc9k8.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_062741-qalkbn80/run-qalkbn80.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_071823-0hjx73rw/run-0hjx73rw.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_8328d2d0556a95ff2759.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_980ee3261a5cf9cce942.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_8fd26361f0705a90a632.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_cec203cb5c36d2873217.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_c061c65a6ce343b1660e.png filter=lfs diff=lfs merge=lfs -text 
+Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_f047fb97b642dc30b33c.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_2805ac51dfa6ef4de083.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_e98ce360ce92d75f9a36.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_430592107b01c838d952.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_ecca4db815beca263f13.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_52422bc6ab7caedd5b8c.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_8e35b9b7d6b7a0806553.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_227067a6cd64b7cdced4.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_55ba9221da0bf3c49190.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_9a50d3903fd31767c616.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_ffacfcca81b53cb27319.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_935711ba29b3ab613691.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_bf885e1339d92cc386d1.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_bdd3a8c8c0c8a7a7d4dd.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_c8333d970fbc70e45c64.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_3d483725c07baf8663d3.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_b0f06ea56e9a9c08850c.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_60d433cf43a3cb8d1412.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_7cd8c962e4d1b79b5dcc.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_41402987f48490139945.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_c6c41d57fcadc12fd69b.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_2d21e8a2ea1688bffb9d.png filter=lfs diff=lfs merge=lfs -text 
+Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_a609810c96cec2279a46.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_4a1fe2fe98784f7b8841.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_6119b9f39242430c319b.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_7e0ee18074e9b8d85c45.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_b01a808f5a897296f898.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_15555bb3e2ce8b16ddcf.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_9652b904aa757dce7aeb.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8500_c2d1b91c197ca101b350.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_03ba3747205343bd9935.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_cc8a6153b15016f58ad3.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9500_342589ce9380e8bb866b.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251208_155943-j5rc8ish/run-j5rc8ish.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_4ea9441b252682155006.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_be5afcc9b61ce7cc9765.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_7c59a605f746fefa06f3.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_e846322d8d1fe1da0c06.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_e7251adb287026b97ff8.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_fb353e591b1b0dbac386.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_4254c55c5a44dae8222b.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_880fb5b7bb7d55a41102.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_0af47ac2b0fd0a7b83b9.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_38859ead3b87553090be.png filter=lfs diff=lfs merge=lfs -text 
+Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_1b3f708ccf2664b9bd84.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_96fc2c23d9374b5c001f.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_7f60fcf85257e0427cb4.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_f36bf77eea280b84a34e.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_92b8064a4e25f8ad3702.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_f6969510d28d905ce414.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_060856-ctbp97lz/run-ctbp97lz.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_2a764e89458c3c8d15fb.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_80cf7f467b6a4ea9a5d4.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1500_fb32391d5c492e093a1a.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_9a388a1a15b60d9f4438.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_c2c619bff47ae122a524.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_102651-55o5soqg/run-55o5soqg.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_7b2c7dbea7c77c3a3523.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_d3b01b8e129b539a85ed.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_287117d5d7643ba31ec4.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_f6b18ba278e34d44baab.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_321720abba124381620b.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_fa7af054654656754134.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_e6c1efef5a74bd11c582.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_f00b3e2c752ac3cf926a.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_67d5ba7897e123897b95.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_9c128d777c7dab549107.png filter=lfs diff=lfs merge=lfs 
-text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_4274b237825ef8cf5d05.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_de7aecbbb4729ab5af9d.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_09fa45bbfff36049e141.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_d8fc778d368d5c2cb79c.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_141739-fk5kdvzr/run-fk5kdvzr.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251209_162337-uv3abozu/run-uv3abozu.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_030325-gkrz1ykg/run-gkrz1ykg.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_032745-o7so78o8/run-o7so78o8.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_035336-u8db4xs3/run-u8db4xs3.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_043009-5878wpml/run-5878wpml.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_045934-tcqz8xbx/run-tcqz8xbx.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_065438-svzut638/run-svzut638.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_071716-kc9aapl4/run-kc9aapl4.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_105833-im5q8jfr/run-im5q8jfr.wandb filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_654e84862d8c0a13f1b5.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_cd153009051a7d605018.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_56d7d215080b273e9155.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_736c017ca88662cd1d11.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_4f69c990d95a223b9d06.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_5fc1dbdaeeaef4847234.png filter=lfs diff=lfs merge=lfs -text +Meissonic/wandb/run-20251210_114439-mrtah7xe/run-mrtah7xe.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/Meissonic/.github/FUNDING.yml b/Meissonic/.github/FUNDING.yml new file mode 100644 index 0000000000000000000000000000000000000000..9e5327a36ebf9280d5d9e804e7f4c212508e48cb --- /dev/null +++ b/Meissonic/.github/FUNDING.yml @@ -0,0 +1,15 @@ +# These are supported funding model platforms + +github: viiika +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry +polar: # Replace 
with a single Polar username +buy_me_a_coffee: # Replace with a single Buy Me a Coffee username +thanks_dev: # Replace with a single thanks.dev username +custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] diff --git a/Meissonic/.gitignore b/Meissonic/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..9db50f6c1af1a72f9a4ea34c1f50a7dc1e25b775 --- /dev/null +++ b/Meissonic/.gitignore @@ -0,0 +1,166 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Mac OS related +.DS_Store +*.DS_Store + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/Meissonic/InfinityStar/.gitignore b/Meissonic/InfinityStar/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b2c8e86c7d21c46399c00bff3ea178b5558eb0ac --- /dev/null +++ b/Meissonic/InfinityStar/.gitignore @@ -0,0 +1,59 @@ +*.swp +**/__pycache__/** +**/.ipynb_checkpoints/** +.idea/* +llava/ +_vis_cached/ +_vqgan/ +_vae/ +_vae*/ +ckpt/ +log/ +tb*/ +img*/ +local_output* +_auto_* +sd-vae-ft-mse/ +stable-diffusion-v1-4/ +*.pth +*.pth.tar +*.ckpt +*.log +*.ipynb +toscli +*.hydra +wandb +*.jpg +*.csv +*.tar.gz +*.bin +tmp +output +*.tsv +output/* +results/ +*.JPEG +debug/ +weights +checkpoints +ref.py +wandb +.DS_Store +ref.sh +ref.py +checkpoints_bk +*.avi +infinity/VideoVAE +saves/ +tmp.sh +ref_*.sh +tmpp.sh +ref2.sh +checkpoints_new +checkpoints_* +tmp_images +tmp_videos +shm +wget-log +data/interactive_toy_data +tools/infer_interact_480p.py.bk diff --git a/Meissonic/InfinityStar/LICENSE b/Meissonic/InfinityStar/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..8e658424f368d6553ccc5fcd7f1cfe792ac1e203 --- /dev/null +++ b/Meissonic/InfinityStar/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 FoundationVision + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Meissonic/InfinityStar/README.md b/Meissonic/InfinityStar/README.md new file mode 100644 index 0000000000000000000000000000000000000000..857009196a199b5538b013cf51737c5d6262b959 --- /dev/null +++ b/Meissonic/InfinityStar/README.md @@ -0,0 +1,187 @@ +

+# Infinity**⭐️**: Unified **S**pace**T**ime **A**uto**R**egressive Modeling for Visual Generation
+
+[![demo platform](https://img.shields.io/badge/Play%20with%20Infinity%21-Infinity%20demo%20platform-lightblue)](http://opensource.bytedance.com/discord/invite)
+[![arXiv](https://img.shields.io/badge/arXiv%20paper-2511.04675-b31b1b.svg)](https://arxiv.org/abs/2511.04675)
+[![huggingface weights](https://img.shields.io/badge/%F0%9F%A4%97%20Weights-FoundationVision/Infinity-yellow)](https://huggingface.co/FoundationVision/InfinityStar)
+
+Infinity⭐️: Unified Spacetime AutoRegressive Modeling for Visual Generation
+ + + +--- +## 🔥 Updates!! +* Nov 7, 2025: 🔥 Paper, Training and Inference Codes && Checkpoints && Demo Website released! +* Sep 18, 2025: 🎉 InfinityStar is accepted as NeurIPS 2025 Oral. + +## 🕹️ Try and Play with Infinity⭐️! + +We provide a [demo website](http://opensource.bytedance.com/discord/invite) for you to play with InfinityStar and generate videos. Enjoy the fun of bitwise video autoregressive modeling! + +## ✨ Overview +We introduce InfinityStar, a unified spacetime autoregressive framework for high-resolution image and dynamic video synthesis. + +- 🧠 **Unified Spacetime Model**: A purely discrete, autoregressive approach that jointly captures spatial and temporal dependencies within a single, elegant architecture. + +- 🎬 **Versatile Generation**: This unified design naturally supports a variety of generation tasks such as **text-to-image**, **text-to-video**, **image-to-video**, and **long interactive video synthesis** via straightforward temporal autoregression. + +- 🏆 **Leading Performance & Speed**: Through extensive experiments, InfinityStar scores **83.74** on VBench, outperforming all autoregressive models by large margins, even surpassing diffusion competitors like HunyuanVideo, approximately **10x** faster than leading diffusion-based methods. + +- 📖 **Pioneering High-Resolution Autoregressive Generation**: To our knowledge, InfinityStar is the first discrete autoregressive video generator capable of producing industrial-level 720p videos, setting a new standard for quality in its class. + + +### 🔥 Unified modeling for image, video generation and long interactive video synthesis 📈: + +
+
+## 🎬 Video Demos
+#### General Aesthetics
+
+#### Anime & 3D Animation
+
+#### Motion
+
+#### Extended Application: Long Interactive Videos
+
+## Benchmark
+
+### Achieves SOTA performance on the image generation benchmark:
+Image Generation Evaluation
+
+### Achieves SOTA performance on the video generation benchmark:
+
+### Surpassing diffusion competitors like HunyuanVideo*:
+
+## Visualization
+
+### Text to image examples
+Text to Image Examples
+
+### Image to video examples
+Image to Video Examples
+
+### Video extrapolation examples
+Video Extrapolation Examples
+ +## 📑 Open-Source Plan + - [x] Training Code + - [x] Web Demo + - [x] InfinityStar Inference Code + - [x] InfinityStar Models Checkpoints + - [x] InfinityStar-Interact Inference Code + - [x] InfinityStar-Interact Checkpoints + + +## Installation +1. We use FlexAttention to speedup training, which requires `torch>=2.5.1`. +2. Install other pip packages via `pip3 install -r requirements.txt`. + + +## Training Scripts +We provide a comprehensive workflow for training and finetuning our model, covering data organization, feature extraction, and training scripts. For detailed instructions, please refer to `data/README.md`. + +## Inference +* **720p Video Generation:** + Use `tools/infer_video_720p.py` to generate 5-second videos at 720p resolution. Due to the high computational cost of training, our released 720p model is trained for 5-second video generation. This script also supports image-to-video generation by specifying an image path. + ```bash + python3 tools/infer_video_720p.py + ``` + +* **480p Variable-Length Video Generation:** + We also provide an intermediate checkpoint for 480p resolution, capable of generating videos of 5 and 10 seconds. Since this model is not specifically optimized for Text-to-Video (T2V), we recommend using the experimental Image-to-Video (I2V) and Video-to-Video (V2V) modes for better results. To specify the video duration, you can edit the `generation_duration` variable in `tools/infer_video_480p.py` to either 5 or 10. This script also supports image-to-video and video continuation by providing a path to an image or a video. + ```bash + python3 tools/infer_video_480p.py + ``` + +* **480p Long Interactive Video Generation:** + Use `tools/infer_interact_480p.py` to generate a long interactive video in 480p. This script supports interactive video generation. You can provide a reference video and multiple prompts. The model will generate a video interactively with your assistance. + ```bash + python3 tools/infer_interact_480p.py + ``` + +## Citation +If our work assists your research, feel free to give us a star ⭐ or cite us using: + +``` +@Article{VAR, + title={Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale Prediction}, + author={Keyu Tian and Yi Jiang and Zehuan Yuan and Bingyue Peng and Liwei Wang}, + year={2024}, + eprint={2404.02905}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +``` +@misc{Infinity, + title={Infinity: Scaling Bitwise AutoRegressive Modeling for High-Resolution Image Synthesis}, + author={Jian Han and Jinlai Liu and Yi Jiang and Bin Yan and Yuqi Zhang and Zehuan Yuan and Bingyue Peng and Xiaobing Liu}, + year={2024}, + eprint={2412.04431}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2412.04431}, +} +``` + +``` +@misc{InfinityStar, + title={InfinityStar: Unified Spacetime AutoRegressive Modeling for Visual Generation}, + author={Jinlai Liu and Jian Han and Bin Yan and Hui Wu and Fengda Zhu and Xing Wang and Yi Jiang and Bingyue Peng and Zehuan Yuan}, + year={2025}, + eprint={2511.04675}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2511.04675}, +} +``` + +## License +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
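(Editor's note on the 480p inference step above: the README configures clip length by editing the `generation_duration` variable inside `tools/infer_video_480p.py` rather than via a command-line flag. Below is a minimal sketch, not part of the repository, of a wrapper that patches that setting and runs the script for both supported durations. It assumes the variable is a plain module-level assignment such as `generation_duration = 5`; adjust the pattern if the script differs.)

```python
# Hypothetical convenience wrapper around tools/infer_video_480p.py.
# Assumption: the script contains a module-level line like `generation_duration = 5`.
import re
import subprocess
from pathlib import Path

SCRIPT = Path("tools/infer_video_480p.py")


def run_480p(duration_s: int) -> None:
    """Patch generation_duration to 5 or 10 seconds, then run the inference script."""
    if duration_s not in (5, 10):
        raise ValueError("the released 480p checkpoint supports 5s or 10s clips")
    source = SCRIPT.read_text()
    # Rewrite only the first `generation_duration = <int>` assignment found.
    patched, n = re.subn(
        r"^(\s*generation_duration\s*=\s*)\d+",
        rf"\g<1>{duration_s}",
        source,
        count=1,
        flags=re.MULTILINE,
    )
    if n == 0:
        raise RuntimeError("generation_duration assignment not found; edit the script manually")
    SCRIPT.write_text(patched)
    subprocess.run(["python3", str(SCRIPT)], check=True)


if __name__ == "__main__":
    for seconds in (5, 10):
        run_480p(seconds)
```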
diff --git a/Meissonic/InfinityStar/__pycache__/train.cpython-310.pyc b/Meissonic/InfinityStar/__pycache__/train.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3793c501136f0a3e532e4403ad5ae5203b551ee8 Binary files /dev/null and b/Meissonic/InfinityStar/__pycache__/train.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/assets/Infinitystar_image_gen_benchmark.png b/Meissonic/InfinityStar/assets/Infinitystar_image_gen_benchmark.png new file mode 100644 index 0000000000000000000000000000000000000000..36d2178a22cea6619461cf7de2fb1da65042b0a0 --- /dev/null +++ b/Meissonic/InfinityStar/assets/Infinitystar_image_gen_benchmark.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d14e42b1cfac29f069e8f8ce36467bdfd74367bc7ac7cd850fe2b4865f9191d +size 489964 diff --git a/Meissonic/InfinityStar/assets/Infinitystar_videogen_benchmark.png b/Meissonic/InfinityStar/assets/Infinitystar_videogen_benchmark.png new file mode 100644 index 0000000000000000000000000000000000000000..71c4529dc05601ac60b42b2b33b2d8e65bf6477b --- /dev/null +++ b/Meissonic/InfinityStar/assets/Infinitystar_videogen_benchmark.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b05f52485ee8d5a68519d42a3240873f8834ae0c67456c5158bb21a79389c39a +size 441872 diff --git a/Meissonic/InfinityStar/assets/Infinitystar_videogen_humaneval.png b/Meissonic/InfinityStar/assets/Infinitystar_videogen_humaneval.png new file mode 100644 index 0000000000000000000000000000000000000000..8cfea037ba1614f4c6708a52fcdb890589af66de --- /dev/null +++ b/Meissonic/InfinityStar/assets/Infinitystar_videogen_humaneval.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d031b401dde7b11c7491c0733e43ac9dcc09544c4c4398ee26915753ccf9be29 +size 248596 diff --git a/Meissonic/InfinityStar/assets/framework.png b/Meissonic/InfinityStar/assets/framework.png new file mode 100644 index 0000000000000000000000000000000000000000..0edbb0b1bd45e8d07772b5a9276d16eff337af93 --- /dev/null +++ b/Meissonic/InfinityStar/assets/framework.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c3fcec1d6d95698b18a2df7a27a0e8cc376d39836e1384d2d9168ff169e0721 +size 3787864 diff --git a/Meissonic/InfinityStar/assets/i2v_examples.png b/Meissonic/InfinityStar/assets/i2v_examples.png new file mode 100644 index 0000000000000000000000000000000000000000..06b46dce22bfd6fea73a6d46eb865fc951613d32 --- /dev/null +++ b/Meissonic/InfinityStar/assets/i2v_examples.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e7653bbb42541f39297e5008875bbf7a30b98dc6025e0b741f1e937c68e33e4 +size 4645655 diff --git a/Meissonic/InfinityStar/assets/logo.png b/Meissonic/InfinityStar/assets/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..acf9150942bc513dfeeeef288f99788610e0c9c8 --- /dev/null +++ b/Meissonic/InfinityStar/assets/logo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8d796b40078c0a10ddcbf8261cc8c66fe141f4fa183d92490cc820c74d3d980 +size 536193 diff --git a/Meissonic/InfinityStar/assets/reference_image.webp b/Meissonic/InfinityStar/assets/reference_image.webp new file mode 100644 index 0000000000000000000000000000000000000000..5dcd87206ac1b30edec00c052abdeda39feb898c --- /dev/null +++ b/Meissonic/InfinityStar/assets/reference_image.webp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0964f3ac35717552547c7d2f40482124d51cfddc2ba6f56fa9427b2f8bd91156 +size 199072 diff --git 
a/Meissonic/InfinityStar/assets/supp_show_images.png b/Meissonic/InfinityStar/assets/supp_show_images.png new file mode 100644 index 0000000000000000000000000000000000000000..600084b6bfb33291475d33ad36f535bed6cfb500 --- /dev/null +++ b/Meissonic/InfinityStar/assets/supp_show_images.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b384868e3fff6762658cc5c8505eba67c41c355a7dc5c2d0c7856fdb8ccf163 +size 26368483 diff --git a/Meissonic/InfinityStar/assets/v2v_examples.png b/Meissonic/InfinityStar/assets/v2v_examples.png new file mode 100644 index 0000000000000000000000000000000000000000..0d10a19443d6b408acd200bc59f823cc018194ea --- /dev/null +++ b/Meissonic/InfinityStar/assets/v2v_examples.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63c3c9c0db94e27f098c3ede52f83d64b66afe50eff768a2d1635cf4c8202fe0 +size 4955839 diff --git a/Meissonic/InfinityStar/cog.yaml b/Meissonic/InfinityStar/cog.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e2d3d4706f6b264b78e4bde88fee4d4a60877d3 --- /dev/null +++ b/Meissonic/InfinityStar/cog.yaml @@ -0,0 +1,46 @@ +# Configuration for Cog ⚙️ +# Reference: https://cog.run/yaml + +build: + # set to true if your model requires a GPU + gpu: true + + # a list of ubuntu apt packages to install + system_packages: + - "libgl1-mesa-glx" + - "libglib2.0-0" + + # python version in the form '3.11' or '3.11.4' + python_version: "3.11" + + # a list of packages in the format == + python_packages: + - torch + - transformers + - easydict + - typed-argument-parser + - seaborn + - kornia + - gputil + - colorama + - omegaconf + - pandas + - timm==0.9.6 + - decord + - pytz + - pandas + - wandb + - colorama + - imageio + - einops + - openai + - httpx==0.20.0 + - opencv-python + - ipython + + # commands run after the environment is setup + run: + - pip install "pydantic<2.0" + - pip install -U flash-attn --no-build-isolation + - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget +predict: "predict.py:Predictor" diff --git a/Meissonic/InfinityStar/data/README.md b/Meissonic/InfinityStar/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2770db24f354262044303bcd7b13fe84c5e4e70a --- /dev/null +++ b/Meissonic/InfinityStar/data/README.md @@ -0,0 +1,57 @@ +# Preparing and Training with Video Metadata + +This guide walks you through preparing your video metadata, splitting it for efficient training, and running the training scripts. + +## 1. Prepare Your Data in `.jsonl` Format + +Your video metadata should be organized in JSON Lines (`.jsonl`) format, where each line is a valid JSON object representing one video. + +**Example:** +```json +{ + "video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", + "begin_frame_id": 0, + "end_frame_id": 120, + "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes.", + "width": 1280, + "height": 720, + "h_div_w": 0.5625, + "fps": 24 +} +``` + +## 2. Split Metadata for Training + +For efficient training, large `.jsonl` files can be split into smaller chunks. + +```bash +python3 data/infinitystar_toy_data/split_jsonls_for_training.py --jsonl_folder_list JSONL_DIR --save_dir SAVE_DIR --chunk_size 100 +``` + +## 3. Extract Video Features + +To extract video features, modify the `extract_video_features.sh` script. Set the `video_data_path` and choose the desired resolution. 
+ +* **480p (5s):** `pn=0.40M` +* **480p (10s):** `pn=0.40M` with `video_frames=161` +* **720p (5s):** `pn=0.90M` + +Then, run the script: +```bash +bash scripts/extract_video_features.sh +``` + +## 4. Run Training Scripts + +Once your metadata is prepared and features are extracted, you can start training. + +**480p Training (5s or 10s):** +```bash +bash scripts/train_480p.sh +``` + +**720p Training (only 5s):** +```bash +bash scripts/train_720p.sh +``` +The 480p configuration supports both 5-second and 10-second video training. For 10-second training, ensure that `video_frames` is set to `161` in `extract_video_features.sh` and `train_480p.sh`. \ No newline at end of file diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0001_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0001_0010_000000100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0001_0010_000000100.jsonl @@ -0,0 +1,100 @@ +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0002_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0002_0010_000000100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0002_0010_000000100.jsonl @@ -0,0 +1,100 @@ +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. 
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness.
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0003_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0003_0010_000000100.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda
--- /dev/null
+++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0003_0010_000000100.jsonl
@@ -0,0 +1,100 @@
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0004_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0004_0010_000000100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0004_0010_000000100.jsonl @@ -0,0 +1,100 @@ +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0005_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0005_0010_000000100.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda
--- /dev/null
+++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0005_0010_000000100.jsonl
@@ -0,0 +1,100 @@
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
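Each `+` line in this file is a single JSON object describing one training clip: source video path, frame range, caption, resolution, aspect ratio, and frame rate. As a rough illustration only, here is a minimal sketch of how such a metadata file could be read and sanity-checked; the helper name `load_clip_metadata` and the checks are assumptions for illustration, not part of the repository's actual data loader.

```python
import json
from pathlib import Path


def load_clip_metadata(jsonl_path):
    """Read a split_jsonls file: one JSON object per line describing a video clip."""
    records = []
    with Path(jsonl_path).open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            # Sanity checks on the stored fields (h_div_w is height / width).
            assert rec["end_frame_id"] > rec["begin_frame_id"], "empty frame range"
            assert abs(rec["h_div_w"] - rec["height"] / rec["width"]) < 1e-6, "h_div_w mismatch"
            records.append(rec)
    return records


if __name__ == "__main__":
    # Hypothetical usage with the file added in this diff (relative to the InfinityStar dir).
    recs = load_clip_metadata(
        "data/infinitystar_toy_data/split_jsonls/000001/0005_0010_000000100.jsonl"
    )
    print(len(recs), "records; first clip:", recs[0]["video_path"], recs[0]["fps"], "fps")
```

Note that h_div_w is redundant with the stored resolution (720 / 1280 = 0.5625), which is what the consistency check above relies on.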
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0006_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0006_0010_000000100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0006_0010_000000100.jsonl @@ -0,0 +1,100 @@ +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements.
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0007_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0007_0010_000000100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0007_0010_000000100.jsonl @@ -0,0 +1,100 @@ +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements.
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0008_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0008_0010_000000100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0008_0010_000000100.jsonl @@ -0,0 +1,100 @@
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements.
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements.
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0009_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0009_0010_000000100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0009_0010_000000100.jsonl @@ -0,0 +1,100 @@ +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0010_0010_000000100.jsonl b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0010_0010_000000100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37f1dcbc9da82d850db70a35bafdb390e7575cda --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0010_0010_000000100.jsonl @@ -0,0 +1,100 @@ +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24}
+{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen.
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. 
The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. 
The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. 
As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} +{"video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4", "begin_frame_id": 0, "end_frame_id": 120, "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes. At first, the viewer sees her face in close-up, with her hair falling over one eye, suggesting that she is looking at something or someone off-screen. As the video progresses, the facial expression changes subtly: the mouth opens slightly, the eyebrows relax, and the eyes become more relaxed. Then, her eyes close and she smiles, maintaining this peaceful state. The hair remains still, suggesting a calm and content atmosphere. In the final frames, the character's face is shown at rest, with closed eyes and a slight smile, conveying a sense of peace or happiness. The background is simple and modern, with a light-colored wall and a green decoration that looks like a plant leaf. The soft lighting enhances the calm and pleasant atmosphere of the scene. The camera focuses on the girl's face and the lens follows her movements. 
Overall, the video conveys a quiet emotional arc, emphasizing the subtle but important changes in the female character's demeanor.", "width": 1280, "height": 720, "h_div_w": 0.5625, "fps": 24} diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls_for_training.py b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls_for_training.py new file mode 100644 index 0000000000000000000000000000000000000000..12d9b93cb843495af84a04e9ea7ef1c9e8b19295 --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls_for_training.py @@ -0,0 +1,118 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import os +import os.path as osp +import time +import itertools +import shutil +import glob +import argparse +import json + +import tqdm +import numpy as np +import threading + +def save_lines(lines, filename): + os.makedirs(osp.dirname(filename), exist_ok=True) + with open(filename, 'w') as f: + f.writelines(lines) + del lines + +def get_part_jsonls(save_dir, total_line_number, ext='.jsonl', chunk_size=1000): + if osp.exists(save_dir): + shutil.rmtree(save_dir) + chunk_id2save_files = {} + missing = False + parts = int(np.ceil(total_line_number / chunk_size)) + for chunk_id in range(1, parts+1): + if chunk_id == parts: + num_of_lines = total_line_number - chunk_size * (parts-1) + else: + num_of_lines = chunk_size + bucket = (chunk_id-1) // 1000 + 1 + chunk_id2save_files[chunk_id] = osp.join(save_dir, f'{bucket:06d}', f'{chunk_id:04d}_{parts:04d}_{num_of_lines:09d}{ext}') + if not osp.exists(chunk_id2save_files[chunk_id]): + missing = True + return missing, chunk_id2save_files + +def split_large_txt_files(all_lines, chunk_id2save_files): + thread_list = [] + chunk_id = 1 + # one progress tick is emitted per completed chunk file, not per input line + pbar = tqdm.tqdm(total=len(chunk_id2save_files)) + # buffer input lines until the current chunk reaches its precomputed size + chunk = [] + for line in all_lines: + chunk.append(line) + cur_chunk_size = int(osp.splitext(osp.basename(chunk_id2save_files[chunk_id]))[0].split('_')[-1]) + if len(chunk) >= cur_chunk_size: + pbar.update(1) + thread_list.append(threading.Thread(target=save_lines, args=(chunk, chunk_id2save_files[chunk_id]))) + thread_list[-1].start() + chunk = [] + chunk_id += 1 + if len(chunk): + raise RuntimeError(f'{len(chunk)} leftover lines do not match the precomputed chunk layout') + assert not len(chunk) + for thread in thread_list: + thread.join() + +from multiprocessing import Manager +lock = Manager().Lock() +def read_jsonl(jsonl_file): + with open(jsonl_file, 'r') as f: + lines = f.readlines() + global pbar + with lock: + pbar.update(1) + return lines + +def read_jsonls(jsonl_files, worker): + global pbar + from multiprocessing.pool import ThreadPool + pbar = tqdm.tqdm(total=len(jsonl_files)) + print(f'[Data Loading] Reading {len(jsonl_files)} meta files...') + all_lines = [] + if len(jsonl_files) == 1: + try: + lines_num = int(osp.splitext(jsonl_files[0])[0].split('_')[-1]) + except (ValueError, IndexError): + lines_num = 0 + pbar = tqdm.tqdm(total=lines_num) + with open(jsonl_files[0], 'r') as f: + for line in f: + pbar.update(1) + all_lines.append(line) + else: + with ThreadPool(worker) as pool: + for img_metas in pool.starmap(read_jsonl, [(bin_file,) for bin_file in jsonl_files]): + all_lines.extend(img_metas) + np.random.shuffle(all_lines) + return all_lines + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--jsonl_folder_list', type=str, default='', nargs='+', help='patha pathb pathc') + parser.add_argument('--save_dir', type=str, default='') + parser.add_argument('--chunk_size', type=int, default=1000) + 
parser.add_argument('--worker', type=int, default=128) + args = parser.parse_args() + + global pbar + t1 = time.time() + jsonl_files = [] + for item in args.jsonl_folder_list: + jsonl_files += glob.glob(osp.join(item, '*.jsonl')) + np.random.shuffle(jsonl_files) + + pbar = tqdm.tqdm(total=len(jsonl_files)) + lines = read_jsonls(jsonl_files, args.worker) + lines = lines * 1000 + print(f'total {len(lines)} lines') + line_num = len(lines) + missing, chunk_id2save_files = get_part_jsonls(args.save_dir, line_num, chunk_size=args.chunk_size) + + split_large_txt_files(lines, chunk_id2save_files) + t2 = time.time() + print(f'split takes {t2-t1}s') diff --git a/Meissonic/InfinityStar/data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4 b/Meissonic/InfinityStar/data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5d02cdfd08d76ccc1edfd22be3fa191415c3b93c --- /dev/null +++ b/Meissonic/InfinityStar/data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aba75b0f17a90f9a150bd331f36b64ad4ef5bd298c3cf09e6b77a005b70b8df +size 4908102 diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/0000_refine_720p.mp4 b/Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/0000_refine_720p.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..122eb7a4d0b81ae9c20757447c2e98fdc9dee6e7 --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/0000_refine_720p.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83cf64485c51f1cbbfdc1d627d7ce15de72ddce8c46de54adaf1231bf4a9313 +size 8972169 diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/prompt.txt b/Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..04475b2295eea8cdc0cf435ef3577fdc18f05e51 --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/prompt.txt @@ -0,0 +1,4 @@ +The office is tidy, with a large desk covered in papers, a laptop, and a cup of coffee. A man in a white shirt sits at the desk, typing on the laptop keyboard. A desk lamp is turned on, casting light on the workspace. +The man stops typing and picks up the cup of coffee from the desk. +The man takes a sip from the coffee cup. +The man sets the coffee cup down and opens a notebook lying next to the laptop. 
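The toy metadata earlier in this diff is one JSON record per line, with the fields video_path, begin_frame_id, end_frame_id, tarsier2_caption, width, height, h_div_w and fps; the single record repeats because split_jsonls_for_training.py deliberately multiplies its input lines (lines = lines * 1000) before re-chunking them. The split step itself only counts and copies lines and never parses the JSON, so it can help to validate the schema up front. The snippet below is a minimal sketch of such a check; the file path and helper name (toy_meta.jsonl, check_toy_jsonl) are illustrative assumptions, not repository code:

```python
import json
from pathlib import Path

# Field names taken from the toy records shown earlier in this diff.
REQUIRED_KEYS = {
    "video_path", "begin_frame_id", "end_frame_id",
    "tarsier2_caption", "width", "height", "h_div_w", "fps",
}

def check_toy_jsonl(jsonl_path):
    """Validate every record of a toy metadata .jsonl file and return the record count."""
    n_records = 0
    with open(jsonl_path, "r") as f:
        for line_no, line in enumerate(f, start=1):
            record = json.loads(line)
            missing = REQUIRED_KEYS - record.keys()
            if missing:
                raise ValueError(f"line {line_no}: missing keys {sorted(missing)}")
            if record["end_frame_id"] <= record["begin_frame_id"]:
                raise ValueError(f"line {line_no}: empty frame range")
            # h_div_w stores height / width, e.g. 720 / 1280 = 0.5625 for the toy clip.
            if abs(record["h_div_w"] - record["height"] / record["width"]) > 1e-6:
                raise ValueError(f"line {line_no}: h_div_w does not match width and height")
            n_records += 1
    return n_records

if __name__ == "__main__":
    # Illustrative path only (relative to the InfinityStar directory, like the
    # video_path entries above); point this at a toy .jsonl before running the split script.
    meta_path = Path("data/infinitystar_toy_data/toy_meta.jsonl")
    if meta_path.exists():
        print(f"{meta_path}: {check_toy_jsonl(meta_path)} valid records")
```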
diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/0000_refine_720p.mp4 b/Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/0000_refine_720p.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..53256e974a9db14e9fe68a557378df3346dd016f --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/0000_refine_720p.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1f64057df76d22164b3104c50894f61927aff7897cc15b684ead3622f231937 +size 15799597 diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/prompt.txt b/Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c512e9b9321b46002df65722fe8cc64a7ce35fa --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/prompt.txt @@ -0,0 +1,4 @@ +A young boy wearing a yellow t-shirt and denim shorts is in a backyard garden. A red ball, a blue watering can, and a green garden hose lie on the grass nearby. The boy is standing next to a flower bed filled with colorful flowers, holding the blue watering can in his right hand. The sun is shining brightly overhead. +The boy lifts the watering can and starts pouring water onto the flowers in the flower bed. +The boy sets the watering can down on the grass and picks up the red ball with both hands. +The boy throws the red ball forward into the garden while standing near the flower bed. diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/0000_refine_720p.mp4 b/Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/0000_refine_720p.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6489f1dc772046f1b8b9ae7b3bb358ee8953738e --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/0000_refine_720p.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd141840a07bb0b06f87df221eec2b705417a8aad13622f7b2298cf91d2eb2c7 +size 10632210 diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/prompt.txt b/Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb1c8b517c6ce5b2a68106e417b6dbb402ea6171 --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/prompt.txt @@ -0,0 +1,4 @@ +A young woman dressed in a light gray hoodie and black leggings is sitting on a wooden bench in a city park. Around her, there are green trees, a paved walking path, and a metal water bottle placed on the bench beside her. She is holding a closed book in her lap and looking ahead thoughtfully. The sky is clear with soft afternoon sunlight filtering through the leaves. +The woman opens the book and begins to read, her eyes scanning the pages. +The woman lifts the metal water bottle and takes a sip from it. +The woman closes the book and looks up, observing the park surroundings. 
diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/0000_refine_720p.mp4 b/Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/0000_refine_720p.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fb9b69b9ad3a6195a7ee4532da0e4646b499c2f0 --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/0000_refine_720p.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b53e8ebcffd4dd3a6a92acb4f7c836ec51ff874258d3c43b2aa56387b06c4384 +size 13742016 diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/prompt.txt b/Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d9d1fa29a3f8470de9802b90eb2670dbdd84ef6 --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/prompt.txt @@ -0,0 +1,4 @@ +The garden is filled with blooming flowers and a wooden bench near a stone path. A watering can and a pair of gardening gloves rest on the bench. A woman in a light green dress stands by the bench, holding a small potted plant with soil visible in the pot. She looks at the plant attentively. +The woman places the potted plant on the bench next to the watering can. +The woman picks up the watering can from the bench and lifts it. +The woman waters the flowers along the stone path using the watering can. diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/0000_refine_720p.mp4 b/Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/0000_refine_720p.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..63fffe18ee4d38398ef3cd91a1b397d14af48255 --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/0000_refine_720p.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:933e9fd3b2dfe640dc193357576dd8f7f894cdde1e2e9f7eba753de09a5a1ef7 +size 12703988 diff --git a/Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/prompt.txt b/Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd6cde7e89f8669f32aa5b5c98eef1000a376d60 --- /dev/null +++ b/Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/prompt.txt @@ -0,0 +1,4 @@ +A young boy wearing a yellow t-shirt and denim shorts stands in a park next to a wooden bench. On the bench lies a red soccer ball and a blue backpack. Trees with green leaves surround the area, and sunlight filters through the branches. The boy looks at the soccer ball while holding the straps of his backpack. +The boy bends down and picks up the red soccer ball from the bench. +The boy holds the soccer ball with both hands and begins to bounce it on the ground. +The boy kicks the soccer ball forward, sending it rolling across the grass. 
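Each interactive_toy_videos entry above pairs a 720p clip (0000_refine_720p.mp4) with a prompt.txt whose first line describes the initial scene and whose remaining lines each describe one follow-up interaction. As a hedged sketch only, with helper names (InteractivePrompt, load_interactive_prompt) that are illustrative rather than repository code, such a file could be parsed like this:

```python
from pathlib import Path
from typing import List, NamedTuple

class InteractivePrompt(NamedTuple):
    scene: str           # first line: static description of the starting scene
    actions: List[str]   # remaining lines: one interaction step each

def load_interactive_prompt(prompt_file):
    """Split a prompt.txt like the ones above into its scene line and action lines."""
    lines = [ln.strip() for ln in Path(prompt_file).read_text().splitlines() if ln.strip()]
    if len(lines) < 2:
        raise ValueError(f"{prompt_file}: expected a scene line plus at least one action")
    return InteractivePrompt(scene=lines[0], actions=lines[1:])

if __name__ == "__main__":
    # Directory layout follows the diff above (relative to the InfinityStar directory):
    # one sub-folder per clip, each containing a prompt.txt next to the video.
    for prompt_file in sorted(Path("data/interactive_toy_videos").glob("*/prompt.txt")):
        prompt = load_interactive_prompt(prompt_file)
        print(prompt_file.parent.name, f"scene + {len(prompt.actions)} actions")
```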
diff --git a/Meissonic/InfinityStar/evaluation/README.md b/Meissonic/InfinityStar/evaluation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..13a06050f166535e75e0846989222565526d9a77 --- /dev/null +++ b/Meissonic/InfinityStar/evaluation/README.md @@ -0,0 +1,2 @@ +# Overview +To facilitate reproducibility and evaluation, we provide the rewritten prompts used in our VBench evaluations. After generating videos with our inference script, you can evaluate their performance using the scoring tools available at [VBench](https://github.com/Vchitect/VBench). \ No newline at end of file diff --git a/Meissonic/InfinityStar/evaluation/VBench_rewrited_prompt.json b/Meissonic/InfinityStar/evaluation/VBench_rewrited_prompt.json new file mode 100644 index 0000000000000000000000000000000000000000..2403d9e7776b2cb5a1f1869f585079a01db5c41c --- /dev/null +++ b/Meissonic/InfinityStar/evaluation/VBench_rewrited_prompt.json @@ -0,0 +1,10078 @@ +[ + { + "prompt_en": "In a still frame, a stop sign", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a stop sign stands prominently against a clear blue sky backdrop. The octagonal red sign with bold white letters is positioned slightly off-center, drawing attention with its vivid color. Sunlight casts a soft shadow across the sign’s surface, emphasizing its edges and creating a subtly dramatic effect. The scene is calm, with no visible movement, and the simplicity of the sign against the expansive sky highlights its importance and message. The camera captures the scene from a slightly low angle, adding depth and perspective to the composition." + }, + { + "prompt_en": "a toilet, frozen in time", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A close-up view of a toilet frozen in time, capturing water defying gravity as it simulates mid-flush, forming splashes in mid-air. The toilet, with a gleaming white porcelain surface, is illuminated by bright, overhead lighting, creating a stark contrast with the water droplets. The camera slowly circles the scene, offering a dynamic perspective of the suspended motion. The play of light on the droplets creates a sparkling effect, highlighting the stillness of the moment despite the dynamic action typically associated with flushing." + }, + { + "prompt_en": "a laptop, frozen in time", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A sleek, modern laptop is seen frozen in time on a minimalist desk. The screen displays a paused vibrant image of a cityscape at night, with colorful lights illuminating the scene. The laptop's lid is open at a typical viewing angle, and the keyboard glows softly with backlighting. The trackpad is centered below the keyboard, and the device's surface reflects a bit of ambient light. A subtle play of shadows and lights enhances the scene, capturing the stillness of the moment. The camera gently circles around the laptop, showcasing its elegance from different angles." + }, + { + "prompt_en": "A tranquil tableau of alley", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A narrow, cobblestone alleyway is depicted, nestled between tall, old stone buildings. The alley is bathed in gentle, warm sunlight filtering through, casting soft shadows that add depth to the scene. The buildings are adorned with small, ornate wrought-iron balconies and flower boxes that overflow with vibrant blooms. At the end of the alley, a lush vine climbs the wall, adding greenery to the serene setting. 
The camera pans slowly from the ground up, capturing the textured stones and the charming architectural details, enhancing the peaceful and picturesque atmosphere." + }, + { + "prompt_en": "A tranquil tableau of bar", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a tranquil tableau of a stylish bar, featuring dim, ambient lighting and a warm, inviting atmosphere. Behind the bar, an elegant display of bottles glistens reflectively under the soft overhead lights. A row of bar stools with plush cushioning stands in front of the counter, adding to the cozy environment. The bartender, a handsome man with neatly styled hair and a crisp white shirt, is seen polishing a glass while offering a welcoming smile. The camera gently pans across the bar, highlighting the sophisticated setting and capturing the serene vibe of this refined space." + }, + { + "prompt_en": "A tranquil tableau of barn", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a tranquil tableau of a weathered wooden barn nestled in a serene rural landscape. The barn is surrounded by lush green grass, with a few scattered wildflowers adding a touch of color. The soft, warm light of the afternoon sun casts gentle shadows on the barn's rustic exterior, highlighting its aged wood and giving it a charming, timeless appearance. The camera slowly pans from left to right, offering a full view of the barn and its peaceful surroundings, enhancing the calm and idyllic atmosphere of the scene." + }, + { + "prompt_en": "A tranquil tableau of bathroom", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a serene bathroom with a freestanding white bathtub as the focal point. Sunlight filters through a nearby window, casting gentle, warm patterns across the tiled floor. Soft, fluffy towels hang neatly on a rack beside the tub. A potted green plant adds a touch of nature and freshness. The atmosphere is calm and peaceful. The camera pans slowly, giving a complete view of the elegantly arranged bathroom, enhancing the sense of tranquility." + }, + { + "prompt_en": "A tranquil tableau of bedroom", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The video captures a serene bedroom setting bathed in soft, natural light streaming through a window. A neatly made bed with crisp white sheets and plush pillows is at the center of the frame. Beside the bed is a small wooden nightstand with a lit lamp that emits a warm glow, casting gentle shadows. The walls are painted in a soothing pastel color, enhancing the room's calming atmosphere. A cozy armchair sits by the window, inviting relaxation. The camera slowly pans across the room, emphasizing its peaceful and inviting ambiance." + }, + { + "prompt_en": "A tranquil tableau of cliff", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a tranquil tableau of a towering cliff overlooking the vast ocean below. The cliff's rugged surface is bathed in the golden light of the setting sun, highlighting its craggy texture and natural beauty. The deep blue ocean contrasts with the vivid colors of the sunset sky, creating a serene and picturesque atmosphere. The camera gently pans across the cliffside, providing a sweeping view of the majestic landscape and the tranquil waters stretching into the horizon." 
+ }, + { + "prompt_en": "In a still frame, courtyard", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a charming courtyard is captured, bathed in the soft afternoon light. The courtyard is surrounded by rustic stone walls adorned with climbing ivy and vibrant flowering plants. In the center, there is a quaint stone fountain with water gently cascading down its tiers. Potted plants with colorful blooms are neatly arranged along the edges, adding lively bursts of color. A wrought iron table and chairs are set to one side, inviting relaxation and conversation. The scene exudes a peaceful, serene ambiance with a touch of elegance and natural beauty." + }, + { + "prompt_en": "In a still frame, gas station", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a gas station scene is depicted under the bright midday sun. The gas station has multiple fuel pumps, each equipped with digital screens and colorful hoses. A sleek, modern car is parked at one of the pumps, facing away from the camera. The station's main building, with its large windows and a red canopy, is visible in the background. The surrounding area is clear, showcasing a clean and well-maintained environment with clear skies above." + }, + { + "prompt_en": "A tranquil tableau of house", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene depicts a tranquil tableau of a charming house nestled amidst lush greenery. The house is styled with a warm, rustic design, featuring a sloped roof and weathered wooden siding that adds to its cozy appearance. Soft sunlight filters through the surrounding trees, casting dappled shadows on the exterior walls and the ground below. A quaint porch with a few potted plants completes the inviting atmosphere. The camera gently pans from left to right, showcasing the serene setting and emphasizing the peaceful ambiance of the home." + }, + { + "prompt_en": "indoor gymnasium, frozen in time", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In an indoor gymnasium frozen in time, a group of athletes is captured mid-motion. A young man is suspended in mid-air, his body twisted and eyes focused as if going for a slam dunk. He is wearing a sleeveless shirt and athletic shorts, with every muscle and sinew on display. Around him, other athletes are similarly frozen, a woman poised to catch a basketball and another person crouched with a focused gaze, ready to jump. Bright overhead lighting illuminates the glossy wooden floor and the sports equipment scattered around, creating a dynamic yet still scene. The camera is slowly circling around the center, emphasizing the stillness of the moment." + }, + { + "prompt_en": "A tranquil tableau of indoor library", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The camera pans gently across a tranquil indoor library, showcasing tall wooden bookshelves filled with an array of books. Soft, warm lighting bathes the room, creating a cozy and inviting atmosphere. The polished wooden tables are neatly arranged with comfortable chairs, some of which have books and a few reading lamps. The camera smoothly transitions to focus on a large window that allows soft daylight to filter into the space, illuminating the peaceful setting." 
+ }, + { + "prompt_en": "A tranquil tableau of kitchen", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene presents a tranquil tableau of a kitchen bathed in soft, natural light filtering through a nearby window. The kitchen features neatly organized countertops, with a wooden bowl filled with fresh, colorful fruits like oranges and apples. A vase with a small bouquet of flowers adds a touch of charm to the setting. In the background, a stainless steel kettle sits on the stove, and a set of matching utensils hangs above it. The subtle tones and serene atmosphere create a peaceful and inviting space. The camera slowly pans across the scene, capturing the harmonious arrangement." + }, + { + "prompt_en": "A tranquil tableau of palace", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The camera slowly approaches a grand and ornate palace, capturing its majestic architecture bathed in the golden light of the setting sun. The palace features intricately designed facades and towering spires that reach towards the sky. Surrounding the palace are beautifully manicured gardens, with lush greenery and colorful flowers adding to the tranquil scene. The atmosphere is serene and peaceful, with soft shadows enhancing the grandeur of the building. The camera gently pans to the side, offering a broader view of the palace and its picturesque surroundings." + }, + { + "prompt_en": "In a still frame, parking lot", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a parking lot is depicted with several parked cars arranged in neat rows. The scene is illuminated by overhead lights, casting a bright and even glow across the area, highlighting the various colors of the cars. Painted lines on the asphalt define each parking space, and a small tree can be seen at the edge of the lot, providing a touch of greenery. The atmosphere is orderly and calm, with a clear sky above, enhancing the overall visibility of the scene." + }, + { + "prompt_en": "In a still frame, phone booth", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a traditional red phone booth stands prominently in an urban setting. The booth, with its classic glass panels and iconic design, is positioned slightly to the left of the frame. The metal handle and keypad are visible through the glass, and the interior is well-lit, which highlights the details inside. The background consists of a blurred street scene with indistinct figures and buildings, giving the impression of a busy city environment while keeping the focus on the phone booth." + }, + { + "prompt_en": "A tranquil tableau of restaurant", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A serene scene unfolds in a cozy restaurant, where soft ambient lighting creates a warm and inviting atmosphere. The tables are elegantly set with white tablecloths and neatly arranged silverware, accompanied by small vases of fresh flowers. A single table is captured in focus, featuring a flickering candle that adds to the tranquil mood. The background reveals hints of gentle chatter and the subtle clinking of glasses, enhancing the peaceful setting. The camera gently pans across the room, showcasing the harmonious layout and calming environment." + }, + { + "prompt_en": "A tranquil tableau of tower", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the distance, a majestic tower stands tall against the backdrop of a golden sunset. 
The tower's intricate architectural details are subtly highlighted by the warm light of the setting sun. The sky is awash with hues of orange and pink, casting a serene atmosphere over the scene. The camera slowly pans upward from the base to the top of the tower, capturing the elegance and grandeur of the structure as it is silhouetted against the colorful sky. Sparse clouds drift lazily, adding to the tranquil and picturesque setting." + }, + { + "prompt_en": "A tranquil tableau of a bowl", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A tranquil tableau showcases a ceramic bowl resting on a rustic wooden table. The bowl, finished in a soft pastel blue, is filled with fresh, ripe strawberries that are vividly red and glistening in the gentle ambient light. A few stray leaves are scattered around the strawberries, adding a touch of greenery to the composition. The scene is illuminated with soft, natural light, creating a warm and serene atmosphere. The camera slowly zooms in, highlighting the textures and colors of the strawberries and the bowl's glaze." + }, + { + "prompt_en": "A tranquil tableau of an apple", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A single, perfectly ripe apple sits atop a wooden table, bathed in soft, warm light that highlights its glossy red skin. The apple is positioned slightly to the left, showcasing its round shape and delicate stem. The table's wooden texture complements the apple's vivid color, and a gentle shadow falls to the right, enhancing the serene and peaceful atmosphere. The camera slowly zooms in, bringing the apple's details into sharper focus and capturing its inviting appearance." + }, + { + "prompt_en": "A tranquil tableau of a bench", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A tranquil scene features a wooden bench positioned in a serene garden setting. The bench is surrounded by lush greenery and vibrant flowers, creating a peaceful atmosphere. Sunlight filters through the leaves of nearby trees, casting dappled shadows across the bench. The camera slowly pans from left to right, capturing the intricate details of the old, weathered wood and the colorful blossoms that dot the landscape. The gentle rustling of leaves and a soft breeze enhance the sense of calm and tranquility." + }, + { + "prompt_en": "A tranquil tableau of a bed", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A peaceful scene depicts a neatly made bed in a softly lit room, with natural light gently streaming in through a nearby window. The bed is adorned with crisp white sheets and a thick, plush comforter, creating an inviting and cozy atmosphere. A pair of fluffy pillows rest against the headboard, slightly angled to catch the light. The camera slowly pans from the head to the foot of the bed, capturing the serene and orderly arrangement, and emphasizing the tranquility of the setting." + }, + { + "prompt_en": "A tranquil tableau of a chair", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A single chair is placed in a serene setting, surrounded by the soft glow of warm light filtering through a nearby window. The chair, made of dark wood with a cushioned seat, is positioned invitingly, facing partially towards the camera. The atmosphere is calm and peaceful, enhanced by the gentle shadows that fall across the floor, creating an inviting and contemplative scene. 
The camera slowly pans from left to right, capturing the tranquil setting and the chair’s elegant design." + }, + { + "prompt_en": "A tranquil tableau of a cup", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A serene scene focuses on a white ceramic cup resting on a wooden table. The cup is filled with steaming hot tea, and delicate wisps of steam gently rise, swirling into the air. Soft, natural lighting bathes the scene, casting warm tones over the wooden surface and highlighting the cup's smooth, glossy finish. The atmosphere is peaceful and calming, with a potted green plant visible in the softly blurred background enhancing the tranquility. The camera moves slowly around the cup, capturing different angles and the steam's graceful ascent." + }, + { + "prompt_en": "A tranquil tableau of a dining table", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a tranquil tableau of a beautifully arranged dining table, bathed in soft, ambient lighting that gives the setting a warm and inviting feel. The table is elegantly set with a crisp, white tablecloth and adorned with fine china plates, polished silverware, and crystal glassware. A centerpiece composed of fresh, vibrant flowers adds a touch of color, enhancing the serene atmosphere. The camera gently pans across the table, highlighting the meticulous arrangement and detail, capturing the calmness and grace of the setting." + }, + { + "prompt_en": "In a still frame, a pear", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a ripe pear rests on a wooden surface. The pear's skin is a vibrant blend of green and yellow, with a few small brown speckles adding texture. Soft, natural light illuminates the pear from one side, casting a gentle shadow to the other side on the wood. The setting suggests a cozy, rustic atmosphere, evoking a sense of freshness and simplicity." + }, + { + "prompt_en": "A tranquil tableau of a bunch of grapes", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A tranquil tableau features a bunch of grapes resting elegantly on a simple white surface. The grapes are a rich, deep purple, with a few leaves scattered around them, adding a touch of green. Soft, diffused lighting creates gentle highlights on the grapes, emphasizing their plump and juicy appearance. The camera slowly pans across the grapes, capturing their velvety texture and the subtle sheen on their skin, evoking a serene and peaceful atmosphere." + }, + { + "prompt_en": "A tranquil tableau of a bowl on the kitchen counter", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A serene kitchen scene features a bowl resting on a clean, polished counter. The bowl is crafted from smooth, white ceramic with a slightly glossy finish, and is filled with an assortment of fresh, colorful fruits including vibrant red apples, bright yellow bananas, and deep purple grapes. The soft, natural light streaming in through a nearby window casts gentle shadows on the counter, adding to the tranquil atmosphere. The camera slowly pans to the right, capturing a glimpse of neatly arranged kitchen utensils and an inviting view of the outside garden through the window." + }, + { + "prompt_en": "A tranquil tableau of a beautiful, handcrafted ceramic bowl", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The video showcases a stunning, handcrafted ceramic bowl, beautifully displayed on a wooden table. 
The bowl features intricate patterns and delicate detailing, highlighting the artisan's skill. The camera slowly circles the bowl, capturing its graceful curves and the richness of its glazed finish. Soft, ambient light illuminates the scene, creating a warm and inviting atmosphere. As the camera completes its gentle rotation, the bowl's exquisite craftsmanship and unique design are fully revealed." + }, + { + "prompt_en": "A tranquil tableau of an antique bowl", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A camera gently pans over a tranquil tableau featuring an antique bowl, set upon a rich, wooden surface. The bowl, with intricate patterns and an aged patina, is beautifully illuminated by soft, warm lighting, accentuating its timeless beauty. The surrounding atmosphere is calm and serene, with hints of vintage decor in the background, enhancing the antique charm of the scene. The camera movement is slow and smooth, capturing the elegant and detailed craftsmanship of the bowl." + }, + { + "prompt_en": "A tranquil tableau of an exquisite mahogany dining table", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A beautifully crafted mahogany dining table stands in the center of the room, its rich, dark wood polished to a high sheen. Soft, warm lighting from an overhead chandelier bathes the table, enhancing the deep, luxurious tones of the wood. The table is set for a meal, adorned with elegant, white porcelain dinnerware and sparkling crystal glassware. A lush floral centerpiece with vibrant blossoms sits in the middle of the table, adding a touch of freshness and color. The camera gently circles the scene, capturing the tranquil elegance of the setting." + }, + { + "prompt_en": "A tranquil tableau of a wooden bench in the park", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The video showcases a serene scene of a wooden bench situated in a lush park. The bench, crafted from rich, dark wood, is placed in a peaceful corner surrounded by vibrant greenery. The soft sunlight filters through the leaves of nearby trees, creating dappled patterns on the bench and the grass. A gentle breeze rustles the leaves, adding to the tranquil atmosphere. The camera smoothly glides around the bench, capturing its details and the serenity of the natural setting." + }, + { + "prompt_en": "A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The camera captures a stationary view of a charming wrought-iron bench nestled among vibrant blooms in a garden setting. Blossoming flowers of various colors, including reds, pinks, and yellows, surround the bench, creating a picturesque and serene atmosphere. Sunlight filters through the foliage, casting dappled patterns on the bench. The intricate design of the wrought-iron bench stands out against the lush greenery, inviting a sense of calm and relaxation. The gentle swaying of the flowers in the breeze adds a subtle motion to this tranquil scene." + }, + { + "prompt_en": "In a still frame, a park bench with a view of the lake", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a wooden park bench with ornate metal arms is centrally positioned, offering a picturesque view of the lake. The bench is slightly weathered, adding a rustic charm. Behind it, the lake spans out, shimmering under the sunlight with gentle ripples on its surface. 
Surrounding the bench, lush green grass and a few scattered wildflowers contribute to a serene and peaceful atmosphere. In the distance, trees line the far edge of the lake, providing a natural frame for the tranquil setting. The camera is positioned at a low angle, emphasizing the bench as it faces the inviting vista of the lake." + }, + { + "prompt_en": "A tranquil tableau of a vintage rocking chair was placed on the porch", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A vintage wooden rocking chair with ornate carvings sits peacefully on a wooden porch. The porch is adorned with potted plants that add a touch of greenery to the scene. Sunlight filters through, casting gentle shadows across the wooden floorboards. The rocking chair is angled slightly to face away from the camera, allowing a view of its beautiful craftsmanship. The atmosphere is serene, evoking a feeling of warmth and nostalgia. The camera slowly pans from left to right, showcasing the tranquil setting." + }, + { + "prompt_en": "A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a tranquil tableau of a small jail cell, dimly lit with a soft, moody atmosphere. The cold, steel bars at the front of the cell form a stark grid pattern, casting shadows on the cell's floor. Inside, the bare walls and simple furnishings, such as a narrow bed and a small table, contribute to the austere environment. The lighting creates a play of light and shadow, emphasizing the isolation and stillness within the cell. The camera slowly pans across the scene, highlighting the details and textures of the space." + }, + { + "prompt_en": "A tranquil tableau of the phone booth was tucked away in a quiet alley", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A serene scene unfolds with a classic red phone booth nestled in a quiet alley. The alley is narrow and lined with cobblestones, creating a charming, old-world atmosphere. Ambient light softly illuminates the phone booth, highlighting its glossy paint and the glass panels. The surrounding walls are covered in ivy, adding a touch of greenery to the tranquil setting. The camera gently circles the booth, capturing the peaceful and nostalgic feel of this secluded location." + }, + { + "prompt_en": "a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A dilapidated phone booth, with its rusted frame and chipped glass panels, stands on the sidewalk as a relic of a bygone era. The booth's faded red paint and weathered appearance evoke a sense of nostalgia. Graffiti and peeling stickers can be seen on its surfaces, adding to its neglected state. The sky above is overcast, casting a greyish hue over the scene, enhancing the booth's frozen, timeless quality. The camera slowly circles the booth, capturing its deteriorated charm from all angles." + }, + { + "prompt_en": "A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A tranquil scene captures an old red barn standing weathered and iconic against the backdrop of the serene countryside. The barn's aged wood and fading red paint add a rustic charm to the setting. Surrounding the barn are tall grasses and a few scattered wildflowers swaying gently in the breeze. 
The sky above is clear with a few wispy clouds, and the warm golden light of the late afternoon sun bathes the entire landscape, enhancing the peaceful and nostalgic atmosphere. The camera slowly pans from right to left, highlighting the barn's architectural details and the tranquility of its surroundings." + }, + { + "prompt_en": "A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A tranquil scene features a charming barn painted in a warm shade of red, nestled amidst a lush, picturesque meadow. The barn exudes a rustic appeal with its wooden structure and simple design, sitting peacefully under a clear blue sky. The meadow is dotted with wildflowers and gently swaying grass, enhancing the serene atmosphere. Soft, natural lighting bathes the barn and meadow, creating an inviting and harmonious setting. The camera takes a gentle, sweeping movement around the barn, capturing the idyllic surroundings from various angles." + }, + { + "prompt_en": "In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame capturing the desolate desert, an oasis emerges, offering a stark contrast to its arid surroundings. Tall, stately palm trees stand in a cluster, their fronds reaching towards the sky, providing a touch of greenery in the vast sea of sand. At the center of the oasis is a motionless, glassy pool of water, reflecting the trees and the clear blue sky above. The scene is bathed in warm sunlight, creating a serene and tranquil atmosphere, with the sun gently illuminating the landscape. The camera remains static, allowing the viewer to fully absorb the calm and peaceful ambiance of this hidden sanctuary amidst the harsh desert environment." + }, + { + "prompt_en": "In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, the Parthenon's majestic Doric columns rise prominently atop the Acropolis, exuding an aura of timeless grace and strength. The robust columns, with their characteristic fluted designs, are bathed in the soft, golden light of the setting sun, highlighting their weathered, ancient surface. Surrounding these columns, the tranquil Athenian landscape stretches into the distance, featuring lush greenery and rolling hills that add a serene backdrop to this historic scene. The stillness of the moment captures the harmonious coexistence of architectural grandeur and natural beauty. The camera gently pans across, emphasizing the symmetry and dignity of the Parthenon against the expansive, peaceful skyline." + }, + { + "prompt_en": "In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, the Temple of Hephaestus is captured with its enduring Doric columns, showcasing impeccable architectural elegance. The temple stands stoically amidst the lush greenery of the surrounding landscape. The lighting highlights the texture of the stone facade, and the peaceful atmosphere is enhanced by the view of a tranquil Athens skyline in the background. 
The camera focuses on the intricate details of the columns, providing a sense of timelessness and historical grandeur." + }, + { + "prompt_en": "In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the frame, an ornate Victorian streetlamp stands majestically, showcasing its intricate ironwork and colorful stained glass panels. The lamp's design is elaborate, with detailed patterns and decorative elements that exude a sense of historical charm and classic elegance. The scene is illuminated by soft and warm lighting, highlighting the streetlamp's artistry and craftsmanship against a gentle twilight sky. The camera offers a slightly upward angle, emphasizing the lamp's grandeur and exquisite details." + }, + { + "prompt_en": "A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "An awe-inspiring view of Stonehenge unfolds, capturing the ancient monument's mysterious allure. Each colossal stone is meticulously placed, standing against a serene and picturesque backdrop. The stones cast long shadows on the lush green grass, with the soft, golden light of the setting sun enhancing the scene's tranquil atmosphere. The camera smoothly pans across the stone circle, highlighting the intricate alignment and the natural beauty surrounding this legendary site." + }, + { + "prompt_en": "In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame set in the vast desert, an oasis appears nestled among the rolling dunes. Tall palm trees stand prominently around the shimmering water, their green fronds swaying gently in the light breeze. The sun casts a warm, golden hue across the sand, highlighting the tranquil and serene atmosphere of the oasis. The camera captures this scene from a slightly elevated angle, providing a broad view of the peaceful contrast between the lush greenery of the palm trees and the endless expanse of golden sand." + }, + { + "prompt_en": "static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a serene desert oasis, where a cluster of tall, lush palm trees stands vibrant against the golden sands. In the foreground, a clear, calm pool of water gently reflects the palm trees and the clear blue sky above, adding to the tranquility of the setting. The lighting is warm and inviting, emphasizing the contrast between the verdant greenery of the oasis and the expansive, arid desert landscape surrounding it. The overall atmosphere is peaceful and idyllic, with no movement, as if time stands still in this beautiful desert retreat." + }, + { + "prompt_en": "A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures an ornate Victorian streetlamp standing gracefully on a cobblestone street corner. The intricate ironwork of the lamp is detailed, its design featuring swirling patterns and elegant curves. 
The lamp emits a gentle, warm glow, casting a soft pool of light on the cobblestones below. The calm night atmosphere is enhanced by the dimly lit surroundings, with shadows stretching across the empty street. The camera slowly pans from the base of the lamp upward, emphasizing the lamp's height and craftsmanship, before settling on the warm pool of light on the cobblestone." + }, + { + "prompt_en": "A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a serene lakeside cabin surrounded by tall pines, their towering forms reflected perfectly in the lake's calm waters. The cabin, with its rustic wooden exterior, exudes a peaceful, cozy charm. Gentle ripples in the water add a subtle texture to the reflection, enhancing the tranquil atmosphere. The camera slowly pans from left to right, showcasing the harmonious blend of nature and architecture, bathed in the soft, warm light of either dawn or dusk. The overall mood is one of peace and solitude." + }, + { + "prompt_en": "In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a vintage gas lantern stands prominently in a historic cobblestone square. The lantern features intricate metalwork and an elegant design, exuding an antique charm. The cobblestones create a textured background, enhancing the timeless atmosphere of the scene. Soft, ambient lighting bathes the square, highlighting the lantern's details and casting gentle shadows on the cobbled surface. The camera is fixed, focusing on the lantern and capturing the serene and nostalgic setting." + }, + { + "prompt_en": "In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a serene Japanese tea ceremony room, bathed in soft, natural lighting. The room features traditional tatami mats that cover the floor, creating a warm and welcoming atmosphere. In the center, a delicate tea set is elegantly arranged, consisting of intricately designed tea bowls and a graceful tea pot, all placed on a small, polished wooden tray. In the corner of the room stands a carefully pruned bonsai tree, adding an element of nature and tranquility to the space. The composition exudes a sense of calm and harmony, inviting viewers to appreciate the beauty of the traditional setting." + }, + { + "prompt_en": "A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a tranquil tableau of the Parthenon, standing resolute in its classical elegance. The iconic structure, with its majestic columns and intricate details, exudes a sense of timelessness. The camera gently pans from right to left, showcasing the Parthenon against a clear blue sky. Sunlight bathes the ancient stone, highlighting the architectural grandeur. The surrounding area is serene, with a few distant trees and hills visible. The atmosphere evokes a profound sense of Athens' rich cultural legacy and enduring history." 
+ }, + { + "prompt_en": "A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the heart of Plaka, a tranquil scene shows the perfect harmony between the neoclassical architecture of the old city and the ancient ruins. The classical buildings, with their elegant columns and detailed facades, present a refined aesthetic that complements the ancient stone structures nearby. The soft glow of the afternoon sun bathes the architecture in a warm, golden light, highlighting the intricate details and weathered surfaces. The camera gently pans across the scene to showcase the seamless blend of historical eras, creating an atmosphere of timeless elegance and serene beauty." + }, + { + "prompt_en": "A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene unfolds with a tranquil view of Chaco Canyon's ancient ruins, nestled amidst the stark beauty of the American Southwest. The camera slowly pans over the arid landscape, capturing the timeless whisper of stone structures that speak of an enigmatic civilization. The ruins, with their intricate stonework, stand against the backdrop of a vast, open sky. Soft sunlight bathes the scene, accentuating the warm tones of the earth and the weathered stones. The atmosphere is one of serene mystery, inviting contemplation of the long-lost lives that once flourished here." + }, + { + "prompt_en": "A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The scene captures a tranquil moment at the edge of the Arabian Desert, where the ancient city of Petra reveals its enigmatic rock-carved façades. Bathed in the warm, golden light of the setting sun, the intricate carvings etched into the reddish sandstone cliffs create a mesmerizing visual display. The atmosphere is serene and filled with a sense of timelessness, as shadows play across the rugged surface of the stone. The camera gently pans to encompass the grandeur of these ancient structures, highlighting their majestic and mysterious allure." + }, + { + "prompt_en": "In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame capturing a serene evening scene, an elegant Art Nouveau lamppost stands tall amidst the cobblestone streets, casting a warm, inviting glow. The intricate design of the lamppost, with its curved lines and floral motifs, reflects the charm of the surrounding architecture. The yellow light from the lamp creates gentle shadows on the cobblestones, enhancing the nostalgic and tranquil atmosphere of the scene. The soft ambient lighting evokes a sense of quiet sophistication and timeless beauty." 
+ }, + { + "prompt_en": "A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the picturesque village square, a traditional wrought-iron streetlamp stands gracefully, showcasing intricate filigree patterns that wind elegantly around its structure. The streetlamp's amber-hued glass panels emit a warm, inviting glow, casting gentle light over the cobblestone path. The camera slowly pans upwards, highlighting the craftsmanship of the lantern and revealing charming village architecture in the background, under a serene evening sky." + }, + { + "prompt_en": "A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A tranquil scene features two elegant lampposts adorned with Art Deco motifs, their geometric shapes crafted from wrought iron and frosted glass shades exuding a sense of vintage glamour. The intricate patterns on the lampposts catch the light, creating a soft, ambient glow that illuminates the surroundings with a warm and inviting atmosphere. The camera slowly pans upward from the base of one lamppost to the top, showcasing the detailed craftsmanship and highlighting the stylish design elements that hearken back to a bygone era." + }, + { + "prompt_en": "In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a picturesque square, a Gothic-style lamppost stands prominently in the frame. The lamppost features intricate stone carvings that enhance its medieval charm, with ornate details that showcase architectural elegance. The square is softly illuminated by the warm glow of the lamppost, creating an inviting and atmospheric scene that captures the essence of historic beauty. The camera is positioned to provide a clear, focused view of the lamppost, emphasizing its detailed craftsmanship." + }, + { + "prompt_en": "In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the still frame, set in the heart of the old city, a row of ornate lantern-style streetlamps lines the narrow alleyway. These intricately designed lamps cast a warm, welcoming glow that illuminates the cobblestone path below. The light creates soft, elongated shadows that dance along the stone walls of the surrounding buildings, enhancing the historical charm of the scene. The atmosphere is peaceful and nostalgic, evoking a sense of timeless beauty in the old city." + }, + { + "prompt_en": "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the heart of the Utah desert, a massive sandstone arch elegantly spans the horizon. The arch's rich, earthy tones contrast beautifully with the deep blue sky above. The sun casts soft, golden light, accentuating the textured surface of the stone. Sparse greenery dotting the landscape adds a splash of color to the otherwise arid surroundings. 
The camera pans slowly from left to right, capturing the grandeur of the arch and the serenity of the desert." + }, + { + "prompt_en": "A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the Arizona desert, the scene captures a massive stone bridge gracefully arching across a rugged canyon. The bridge, with its majestic arches, casts gentle shadows onto the rocky terrain below. The surrounding desert landscape is bathed in the warm, golden hues of the setting sun, creating a serene and magnificent atmosphere. The camera pans slowly across the scene, highlighting the intricate details of the stone bridge and the vastness of the canyon, emphasizing the tranquil beauty of the desert vista." + }, + { + "prompt_en": "A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the corner of a minimalist tea room, a bonsai tree gracefully brings a touch of nature's beauty to the serene setting. The room is characterized by clean lines and subtle tones, creating a simple yet elegant atmosphere. The bonsai, with its intricately shaped branches and lush green foliage, stands on a small wooden table, serving as a captivating focal point. Soft light from a nearby window bathes the tree, highlighting its delicate details and enhancing the tranquil ambiance of the tea room. The camera moves in a gentle arc around the bonsai, capturing its beauty and the harmonious design of the space." + }, + { + "prompt_en": "In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, set within the serene ambiance of a traditional tea room, a meticulously arranged tea set takes center stage. Porcelain cups are neatly placed alongside a finely crafted bamboo whisk, all resting on a polished wooden table. The room's soft, diffused lighting enhances the delicate textures and elegant designs of the tea set, creating an atmosphere of calm and refinement. The backdrop features traditional Japanese decor, with tatami mats and shoji screens, adding to the tranquil and respectful space. The camera slowly pans from left to right, capturing the harmonious arrangement of the tea set and the peaceful surroundings." + }, + { + "prompt_en": "In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame capturing the serene beauty of a Zen garden, a rustic teahouse stands gracefully. The teahouse features tatami mat seating, arranged in a harmonious pattern that invites quiet contemplation. A traditional charcoal brazier sits centrally, its metal finish offering a contrast to the natural textures of the wood and tatami. Soft, ambient light filters through the paper windows, creating a tranquil atmosphere within the teahouse. Lush greenery surrounds the structure, completing the scene with a touch of nature's elegance. The camera remains still, focusing on the peacefulness emanating from this serene setting." 
+ }, + { + "prompt_en": "A tranquil tableau of a country estate's library featured elegant wooden shelves", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The video showcases a serene scene within a country estate's library, where elegantly carved wooden shelves line the walls, displaying a variety of books bound in rich, leather covers. The lighting is warm and inviting, casting a gentle glow across the room, highlighting the intricate woodwork of the shelving. The camera pans slowly from left to right, capturing the calm and inviting atmosphere, as well as the classic charm and grandeur of the space. A plush armchair and a small wooden table are positioned in front of a large window with soft curtains, completing the tranquil tableau." + }, + { + "prompt_en": "A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "Beneath the expansive shade of a solitary oak tree, a weathered wooden park bench rests quietly on the lush grass. The bench, with its old, rustic charm, reveals the marks of time and weathering on its surface. Sunlight filters through the dense canopy of leaves, casting dappled patterns of light and shadow on the bench and the ground around it. The atmosphere is serene and peaceful, enhancing the sense of solitude. The camera gently pans from the top of the oak tree down to the bench, highlighting the contrasting texture of the tree bark and the worn wood." + }, + { + "prompt_en": "A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A tranquil scene unfolds beside a serene pond, where the elegant branches of a weeping willow tree drape gracefully over the water's surface. The mirror-like pond reflects the tree's cascading foliage, enhancing the scene's peaceful and calming ambiance. Soft, natural lighting bathes this idyllic setting, casting gentle shadows and adding depth to the tableau. The camera slowly pans across the scene, capturing the harmonious interaction between the tree and its reflection on the still water." + }, + { + "prompt_en": "A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "A tranquil scene unfolds in a Zen garden, where a meticulously raked gravel path gracefully winds through the serene landscape, leading to a peaceful rock garden. The gravel is arranged in harmonious patterns, creating a sense of order and calm. The rock garden features an artful arrangement of variously sized stones, surrounded by smooth pebbles and minimalist greenery. The overall atmosphere is one of profound stillness and reflection. Soft sunlight filters through the trees, casting gentle shadows across the garden. The camera slowly pans over the path and garden, capturing the harmonious balance of nature and design." + }, + { + "prompt_en": "In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame, a serene pond is bordered by elegant weeping cherry trees, their delicate pink blossoms cascading gently through the air. 
The petals float slowly, creating a soft, picturesque scene as they settle onto the glass-like surface of the pond, forming a scattered, floating carpet of blooms. This peaceful atmosphere is enhanced by long, graceful branches arching over the water, contributing to the tranquil beauty of the scene. The camera is positioned at a slight angle, capturing both the pond and the overhanging branches, showcasing the harmonious interaction between the cherry trees and the water below." + }, + { + "prompt_en": "In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In a still frame capturing the historic library's reading room, rows of antique leather chairs and mahogany tables create an atmosphere of serene elegance and literary contemplation. The rich, warm tones of the mahogany tables and the deep, polished leather of the chairs add a sense of timelessness and sophisticated charm. The gentle lighting casts a soft glow across the scene, highlighting the intricate woodwork and the inviting ambiance of this quiet haven. The camera remains still, emphasizing the tranquil and contemplative nature of the space." + }, + { + "prompt_en": "A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "The camera pans slowly over a tranquil orchid garden, showcasing a variety of delicate blooms in a serene and peaceful atmosphere. The soft lighting illuminates the vibrant colors of the orchids, highlighting their exquisite petals and intricate patterns. A gentle breeze rustles the leaves, adding a sense of calm and natural beauty to the scene. The camera continues to sweep across the garden, capturing the harmony and quiet elegance of this enchanting space." + }, + { + "prompt_en": "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time", + "dimension": [ + "temporal_flickering" + ], + "refined_prompt": "In the serene courtyard, a centuries-old stone well stands at the center of a tranquil tableau. The well's mossy stones, aged and weathered, exude a sense of history, bearing silent witness to the passage of time. Surrounding the well, the courtyard is enveloped in a peaceful atmosphere, with soft sunlight filtering through the leaves of nearby trees, casting gentle shadows on the ground. The camera captures a steady, wide-angle view, slowly panning across the scene to reveal the well's intricate details and the quiet beauty of the surrounding courtyard." + }, + { + "prompt_en": "a bird and a cat", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "bird and cat" + } + }, + "refined_prompt": "A bird is perched on the branch of a tree, with its vibrant feathers illuminated by the warm sunlight filtering through the leaves. Below, a cat prowls stealthily across the grass, its eyes focused on the bird above. The cat, with sleek fur and a curious expression, moves cautiously forward, tail swaying slightly. Meanwhile, the bird occasionally glances down, its feathers ruffling slightly in the gentle breeze. The scene is set in a lush garden, with colorful flowers and dappled light creating a serene atmosphere. 
The camera slowly pans upward from the cat to the bird, capturing the graceful movements of both animals." + }, + { + "prompt_en": "a cat and a dog", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "cat and dog" + } + }, + "refined_prompt": "A charming cat and a playful dog are in a cozy living room, lounging on a soft carpet. The cat, with its sleek fur and bright eyes, is gently batting its paw at a small toy mouse. Meanwhile, the dog, with its fluffy coat and attentive expression, watches the cat's playful antics with curiosity. The setting is warm and inviting, with soft sunlight filtering in through a nearby window, casting a gentle glow on the carpet where the animals are. The gentle sway of the curtains adds to the calming atmosphere. The camera captures this heartwarming scene with a slow, smooth pan across the room, highlighting the interaction between the two animals." + }, + { + "prompt_en": "a dog and a horse", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "dog and horse" + } + }, + "refined_prompt": "On a lush green field, a beautiful dog is playfully running around a majestic horse. The dog has a sleek coat and a joyful expression, its tail wagging energetically. The horse, with a glossy mane and a strong, elegant stature, stands calmly, occasionally glancing at the lively dog. The scene has a serene and harmonious atmosphere. The camera captures a wide shot, moving slightly to follow the dog's movement, emphasizing the peaceful interaction between the two animals." + }, + { + "prompt_en": "a horse and a sheep", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "horse and sheep" + } + }, + "refined_prompt": "A majestic horse stands beside a fluffy sheep in a lush, green pasture under a bright, clear sky. The horse, with its shiny brown coat and flowing mane, stands tall and gracefully, staring into the distance. The sheep, with its woolly white fleece, grazes contentedly on the grass beside the horse. The scene is peaceful and pastoral, with a gentle breeze rustling the grass. The camera slowly pans from the horse to the sheep, highlighting their companionship and the serene countryside setting." + }, + { + "prompt_en": "a sheep and a cow", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "sheep and cow" + } + }, + "refined_prompt": "In a sunny, open pasture filled with lush green grass, a sheep and a cow stand grazing side by side. The sheep, with its fluffy white wool, is nibbling on the grass, while the cow, with its sleek brown and white coat, chews contentedly nearby. The bright daylight creates a serene and peaceful atmosphere, with a few small daisies scattered around their feet. The camera captures this idyllic scene from a slightly elevated angle, gradually zooming in to focus on the gentle interaction between the two animals." + }, + { + "prompt_en": "a cow and an elephant", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "cow and elephant" + } + }, + "refined_prompt": "A charming scene features a cow and an elephant standing side by side in a lush, green field. The cow is a vibrant brown and white, with its gentle eyes and smooth, glossy coat glistening under the bright sunlight. 
Beside it, the elephant, a majestic gray giant with expressive eyes, stands calmly, its large ears gently flapping in the breeze. The atmosphere is serene, with a clear blue sky overhead and patches of wildflowers scattered throughout the field. The camera smoothly pans across the scene, highlighting the gentle interaction between these two magnificent creatures against a peaceful landscape." + }, + { + "prompt_en": "an elephant and a bear", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "elephant and bear" + } + }, + "refined_prompt": "In a serene natural setting, an elephant and a bear are positioned side by side on a grassy field. The elephant, with its large ears and majestic trunk, stands calmly, its trunk gently grazing the grass. Next to it, the bear, with a thick fur coat and a curious expression, sits on its hind legs, observing its surroundings. The background features lush greenery and tall trees under a soft, natural light. The camera pans slowly from left to right, capturing the harmony between the two animals in their tranquil environment." + }, + { + "prompt_en": "a bear and a zebra", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "bear and zebra" + } + }, + "refined_prompt": "In a serene natural setting, a bear and a zebra stand side by side in a lush green meadow bathed in soft, warm sunlight. The bear, with its thick, dark fur and powerful build, looks curiously towards the zebra, which is characterized by its striking black and white stripes. The two animals stand peacefully together, with tall grass gently swaying in the background. The scene captures an unusual yet harmonious moment in nature. The camera remains steady, framing both animals as they coexist in the tranquil setting." + }, + { + "prompt_en": "a zebra and a giraffe", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "zebra and giraffe" + } + }, + "refined_prompt": "A scene unfolds in a vibrant savannah setting, where a zebra and a giraffe stand side by side under the bright afternoon sun. The zebra, with its distinctive black-and-white stripes, stands calmly on the grassy terrain, while the tall giraffe, with its long neck and patterned coat of brown patches, gracefully gazes around the area. The sky is a clear blue, contrasting beautifully with the animals and the grassland. The camera captures a gentle sweeping motion, providing a panoramic view of this serene wildlife moment." + }, + { + "prompt_en": "a giraffe and a bird", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "giraffe and bird" + } + }, + "refined_prompt": "A tall and graceful giraffe is standing in a sunlit savannah, with golden grass stretching out to the horizon. Perched delicately on the giraffe's long, elegant neck is a small, vibrant bird with bright blue and yellow feathers. The giraffe is calmly gazing into the distance, while the bird occasionally flutters its wings, adding a lively touch to the serene scene. The gentle movement of the golden grass in the breeze enhances the tranquil and natural ambiance. The camera slowly pans around the giraffe and bird, capturing their interaction against the backdrop of the vast, open landscape."
+ }, + { + "prompt_en": "a chair and a couch", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "chair and couch" + } + }, + "refined_prompt": "A plush armchair and a cozy couch are positioned in a well-lit living room setting. The armchair, featuring a soft, velvet upholstery in a deep blue hue, is angled slightly towards the couch. Next to it, the couch is upholstered in a light gray fabric with large cushions and offers ample seating space. Between the two, a small, round coffee table holds a few decorative items, such as a vase with fresh flowers and a book. The soft lighting and warm atmosphere create an inviting and comfortable space. The camera gently pans to show the arrangement from a slightly different angle, enhancing the room's serene ambiance." + }, + { + "prompt_en": "a couch and a potted plant", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "couch and potted plant" + } + }, + "refined_prompt": "A plush, light-colored couch with soft cushions is positioned against a wall in a cozy room. The couch appears inviting, its fabric catching the gentle ambient light. To the left of the couch, a large potted plant with lush, green leaves reaches upward, adding a touch of nature to the scene. The pot is a simple, elegant design. The camera remains static, capturing this serene and comfortable corner of the room, portraying a calm and welcoming atmosphere." + }, + { + "prompt_en": "a potted plant and a tv", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "potted plant and tv" + } + }, + "refined_prompt": "In a cozy living room setting, a green potted plant with broad, lush leaves is placed on a stand beside a sleek flat-screen TV. The TV is mounted on a modern entertainment console, and the screen is off, reflecting the room's ambient lighting. The plant adds a touch of nature and color to the scene, with its leaves fanning out elegantly, partially overlapping the TV's frame. The camera slowly pans from the plant to the TV, capturing the harmonious balance of technology and nature in the room." + }, + { + "prompt_en": "a tv and a laptop", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "tv and laptop" + } + }, + "refined_prompt": "A TV and a laptop are positioned next to each other on a sleek, modern entertainment stand. The TV displays a vibrant and colorful nature scene with lush green forests, while the laptop screen shows a spreadsheet filled with data. The TV's screen is large and commands attention, while the laptop is open and angled slightly towards the TV. The atmosphere is calm, with soft lighting enhancing the screens' brightness, creating a balanced visual contrast between nature and technology. The camera slowly pans from left to right, capturing the TV and then the laptop in sequence." + }, + { + "prompt_en": "a laptop and a remote", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "laptop and remote" + } + }, + "refined_prompt": "A sleek laptop with a thin profile is open on a polished wooden table. Its screen displays a vibrant image, casting a gentle glow onto the keyboard. Next to the laptop, a compact remote is placed with several buttons visible, including a bright red power button that stands out. The scene is well-lit, creating a clear reflection of both the laptop and remote on the table's surface. 
The camera pans slowly from right to left, emphasizing the harmonious arrangement of the two devices in this modern setup." + }, + { + "prompt_en": "a remote and a keyboard", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "remote and keyboard" + } + }, + "refined_prompt": "A close-up view shows a sleek remote control resting on the surface of a modern computer keyboard. The remote, with a black glossy finish, features various buttons, including volume and channel controls, and a large circular navigation pad. The keyboard, with its slim profile and evenly spaced keys, has a metallic finish that reflects light subtly. The scene is softly lit, creating a polished and sophisticated atmosphere as the camera slowly pans across the surface, highlighting the smooth textures of both devices." + }, + { + "prompt_en": "a keyboard and a cell phone", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "keyboard and cell phone" + } + }, + "refined_prompt": "A close-up of a sleek black and silver keyboard is shown, with the keys illuminated by a soft ambient light. Beside it, a modern smartphone with a glossy screen rests on the desk. The screen of the cell phone displays a vibrant image, casting a subtle glow onto the keyboard. The camera gently pans from left to right, creating a smooth and polished view of the items, enhancing the high-tech and professional atmosphere of the scene." + }, + { + "prompt_en": "a cell phone and a book", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "cell phone and book" + } + }, + "refined_prompt": "A cell phone is placed beside an open book on a wooden table. The phone screen is lit, displaying a vibrant blue and white interface, while the book is open to a page filled with text. Natural light filters in from a nearby window, casting soft shadows on the table and creating a warm, inviting ambiance. The camera gently pans over the scene, highlighting the contrast between modern technology and traditional reading materials." + }, + { + "prompt_en": "a book and a clock", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "book and clock" + } + }, + "refined_prompt": "A hardcover book rests on a wooden table, its cover featuring intricate golden lettering and an elaborate design. Beside the book is a vintage-style clock with a round face, Roman numerals, and elegant metal hands pointing to 3:45. The scene is warmly lit, casting soft shadows across the table and highlighting the textures of the book cover and the clock's ornate frame. The camera slowly pans from left to right, capturing the detailed craftsmanship of both the book and the clock in close-up." + }, + { + "prompt_en": "a clock and a backpack", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "clock and backpack" + } + }, + "refined_prompt": "A close-up view features a stylish clock and a backpack on a wooden table, with soft ambient lighting creating a warm atmosphere. The clock has a sleek design with a black rim and clear glass face, displaying elegantly crafted hour and minute hands pointing at 3 o'clock. Next to the clock, the backpack is made of a rugged canvas material, showcasing sturdy leather straps and a zippered pocket. 
The camera shifts slightly to create a gentle dynamic effect, highlighting the details of the clock's face and the texture of the backpack." + }, + { + "prompt_en": "a backpack and an umbrella", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "backpack and umbrella" + } + }, + "refined_prompt": "A stylish backpack, made of leather with detailed stitching, is placed on a wooden bench. Next to the backpack, a compact, black umbrella with a sleek, glossy finish is leaning against the bench. The bench is situated in a serene park setting, with soft sunlight filtering through the trees, casting dappled shadows on the ground. The scene captures the tranquility and preparedness for a sudden change in weather, evoking a relaxed and peaceful atmosphere." + }, + { + "prompt_en": "an umbrella and a handbag", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "umbrella and handbag" + } + }, + "refined_prompt": "An umbrella and a handbag are placed side by side on a sandy beach. The umbrella is open, displaying a vibrant blue and white striped pattern, providing shade and a striking contrast against the sand. The handbag, made of light beige wicker with leather handles, rests next to the umbrella on the sand, partially shaded by the umbrella's canopy. The sunlight highlights the textures of both items, creating an inviting and relaxed beach atmosphere. Shadows cast by the umbrella and handbag add dimension to the scene." + }, + { + "prompt_en": "a handbag and a tie", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "handbag and tie" + } + }, + "refined_prompt": "A sleek, elegant handbag and a stylish tie are displayed side by side on a polished wooden table. The handbag, made of smooth leather, features a glossy finish and subtle stitching details. Its handle rests gently on the table, creating a shadow in the soft, ambient lighting. The tie, made of fine silk, features a sophisticated pattern of diagonal stripes in complementary colors. Its fabric gleams slightly under the warm light, enhancing its luxurious appearance. The camera slowly pans from left to right, showcasing the craftsmanship and design of both the handbag and the tie." + }, + { + "prompt_en": "a tie and a suitcase", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "tie and suitcase" + } + }, + "refined_prompt": "A neatly positioned black necktie is coiled elegantly over the handle of a polished black leather suitcase. The suitcase, with shiny metallic locks, is set against a neutral background, exuding a sense of sophistication and professionalism. The camera slowly zooms in, capturing the rich textures of the tie and the gleaming surface of the suitcase, highlighting their stylish appeal and attention to detail." + }, + { + "prompt_en": "a suitcase and a vase", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "suitcase and vase" + } + }, + "refined_prompt": "A stylish suitcase sits on a wooden table next to a delicate porcelain vase. The suitcase is medium-sized, with a textured leather finish and metal clasps that catch the light. It is positioned with its handle facing upwards. Beside it, the vase stands gracefully, adorned with intricate floral patterns in soft pastel colors. The vase is filled with fresh flowers, their vibrant petals adding a touch of color to the scene. 
The overall atmosphere is calm and elegant, enhanced by soft natural lighting illuminating the objects from one side. The camera gently pans from the suitcase to the vase, showcasing their details and creating a harmonious composition." + }, + { + "prompt_en": "a vase and scissors", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "vase and scissors" + } + }, + "refined_prompt": "On a polished wooden table, a beautifully crafted ceramic vase sits alongside a pair of metallic scissors. The vase is elegantly designed, with intricate floral patterns in soft blue and white hues. It holds a few freshly cut flowers, which add a touch of vibrant color against the vase's delicate design. The scissors, with shiny metal blades and black handles, are positioned beside the vase, partially open, suggesting recent use. Soft sunlight filters through a window, casting gentle shadows and highlighting the textures of both items, creating a serene and inviting atmosphere." + }, + { + "prompt_en": "scissors and a teddy bear", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "scissors and teddy bear" + } + }, + "refined_prompt": "A pair of shiny, metallic scissors lies on a wooden table next to a small, plush teddy bear. The scissors are open, with the blades reflecting ambient light and highlighting their sharpness. The teddy bear is soft and fluffy, with tan fur, button eyes, and a cute little bowtie around its neck. The camera smoothly pans across the scene from the teddy bear towards the open scissors, capturing the textures and details of both objects under warm, cozy lighting." + }, + { + "prompt_en": "a teddy bear and a frisbee", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "teddy bear and frisbee" + } + }, + "refined_prompt": "A charming teddy bear sits upright on a lush green lawn, looking adorable with its soft, plush fur and a small red bow around its neck. Nearby, a bright blue frisbee lies on the grass, slightly tilted. The teddy bear seems to be gazing at the frisbee with curious innocence. Sunlight filters through the leaves of overhead trees, casting a gentle dappled light on the scene and creating a warm and inviting atmosphere. The camera captures this serene moment with a slow, sweeping movement from left to right, emphasizing the playful and peaceful setting." + }, + { + "prompt_en": "a frisbee and skis", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "frisbee and skis" + } + }, + "refined_prompt": "A vibrant orange frisbee lies on a snowy terrain, partially buried in the snow. Nearby, a pair of sleek skis stand upright, their pointed tips leaning slightly towards each other, casting subtle shadows on the snow. The sun shines brightly, illuminating the scene and creating a crisp, clear atmosphere. The camera pans slowly across the scene, capturing the dynamic contrast between the playful frisbee and the stationary skis amidst the serene snowy landscape." + }, + { + "prompt_en": "skis and a snowboard", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "skis and snowboard" + } + }, + "refined_prompt": "On a snowy slope, a pair of skis and a snowboard are positioned close to each other, partially buried in fresh powder. 
The skis are sleek and colorful, with vibrant patterns along their surfaces, while the snowboard features a bold graphic design. The snow reflects a bright, clear sky, enhancing the vibrant colors of the equipment. The camera starts with a close-up of the skis, then smoothly pans to the snowboard, capturing the snowy landscape and nearby trees in the background." + }, + { + "prompt_en": "a snowboard and a sports ball", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "snowboard and sports ball" + } + }, + "refined_prompt": "The scene features a snowboard and a sports ball placed side by side on a patch of fresh, powdered snow. The snowboard has a sleek design with vibrant blue and white stripes, while the sports ball, a soccer ball, showcases a classic black and white pattern. The snow sparkles under the bright sunlight, creating a crisp and energetic atmosphere. A gentle breeze blows, causing a slight flutter of snowflakes across the scene, enhancing the fresh and wintry feel. The camera slowly pans from left to right, capturing the contrast between the snowboard and the ball against the pristine snowy backdrop." + }, + { + "prompt_en": "a sports ball and a kite", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "sports ball and kite" + } + }, + "refined_prompt": "A vibrant scene features a sports ball and a colorful kite on a sunlit field. The sports ball, with its distinctive pattern, rests on the lush green grass, casting a small shadow in the bright sunlight. Above the field, a kite in vivid hues of red, blue, and yellow soars against the clear blue sky. The kite's tail dances gracefully in the breeze, creating a playful movement. The atmosphere is lively and fresh, with a gentle wind causing the grass blades to sway slightly. The camera pans upward from the sports ball to capture the kite's flight, enhancing the sense of freedom and joy." + }, + { + "prompt_en": "a kite and a baseball bat", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "kite and baseball bat" + } + }, + "refined_prompt": "A vibrant kite, adorned with colorful patterns, floats gracefully in the sky against a backdrop of fluffy white clouds. Below, a baseball bat is positioned upright, planted in the sandy ground of a beach. The kite's tail dances in the breeze, creating a playful movement, while the baseball bat casts a small shadow on the sand. The scene captures the essence of a playful, carefree day at the beach. The camera slowly pans upward from the baseball bat to the kite, emphasizing the contrast between the grounded bat and the soaring kite." + }, + { + "prompt_en": "a baseball bat and a baseball glove", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "baseball bat and baseball glove" + } + }, + "refined_prompt": "A baseball bat is leaning against a wooden fence, with a well-worn baseball glove placed next to it on the ground. The bat has a smooth, polished wooden surface with a branded logo near its handle. The glove is made of brown leather, with visible stitching and a pocket designed for catching baseballs. Sunlight streams through, casting soft shadows on the bat and glove, creating a warm and nostalgic atmosphere. The camera gently moves from left to right, slowly revealing these two iconic pieces of baseball equipment." 
+ }, + { + "prompt_en": "a baseball glove and a skateboard", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "baseball glove and skateboard" + } + }, + "refined_prompt": "A baseball glove and a skateboard are placed on a smooth, concrete surface. The baseball glove is positioned open, with its tan leather showcasing intricate stitching and weathered texture. The skateboard, featuring a vibrant graphic design on its deck, lays beside the glove. The wheels of the skateboard cast subtle shadows on the ground. The camera slowly pans over these objects, highlighting their details against the contrast of the sunlit backdrop." + }, + { + "prompt_en": "a skateboard and a surfboard", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "skateboard and surfboard" + } + }, + "refined_prompt": "A skateboard and a surfboard are placed side by side on a sandy beach. The skateboard, to the left, features a vibrant blue deck with black grip tape and bright yellow wheels, casting gentle shadows on the sand. Next to it, the surfboard stretches out, its sleek white surface with bold red stripes reflecting the sun's light. The camera slowly pans from left to right, capturing the contrast between the skateboard's urban style and the surfboard's oceanic vibe, with the gentle sound of waves in the background adding to the serene atmosphere." + }, + { + "prompt_en": "a surfboard and a tennis racket", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "surfboard and tennis racket" + } + }, + "refined_prompt": "A surfboard and a tennis racket are placed on a sandy beach. The surfboard, with a shiny blue and white design, lies flat with its nose pointing towards the ocean, highlighting the sand's texture around it. Just beside it, the tennis racket, with its black frame and tightly strung strings, rests angled slightly upward, and small pebbles surround its handle. Both objects are illuminated by the warm glow of the setting sun, casting soft shadows onto the sand. The camera pans lightly from left to right, capturing the serene beach atmosphere." + }, + { + "prompt_en": "a tennis racket and a bottle", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "tennis racket and bottle" + } + }, + "refined_prompt": "A tennis racket is placed next to a bottle on a wooden surface. The tennis racket has a sleek design with black and neon green accents, and its strings are tightly woven. The bottle is clear and half-filled with water, reflecting light from a nearby source, creating a subtle shine. The combination of the sporty tennis racket and the refreshing bottle suggests a preparation for an active session. The scene is evenly lit, providing a crisp and clear view of both objects. The camera takes a slow, smooth pan across the scene, capturing the details of the racket and the bottle together." + }, + { + "prompt_en": "a bottle and a chair", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "bottle and chair" + } + }, + "refined_prompt": "A single bottle stands upright on a small table, while a chair sits nearby. The bottle is made of clear glass and is filled with a vibrant blue liquid, reflecting the ambient light softly. The chair, crafted from dark wood, has a simple yet elegant design and is positioned slightly angled towards the bottle. 
The room is softly lit, with shadows creating a cozy atmosphere. The camera gently pans from the bottle to the chair, highlighting the contrast between the two objects in this serene setting." + }, + { + "prompt_en": "an airplane and a train", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "airplane and train" + } + }, + "refined_prompt": "A passenger airplane is flying across a clear blue sky, its sleek body glistening in the sunlight. Below, a high-speed train moves swiftly along a railroad track set amidst a scenic countryside. The train's modern, streamlined design complements the airplane's grace as both vehicles continue their respective journeys. The camera follows the synchronized movement of the airplane above and the train below, capturing the harmony between air and land travel against a picturesque landscape." + }, + { + "prompt_en": "a train and a boat", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "train and boat" + } + }, + "refined_prompt": "A sleek, modern train moves briskly along parallel tracks, heading towards the camera. Its shiny exterior reflects the bright daylight, and its windows glint in the sun. Meanwhile, a small, vibrant boat drifts gently on a nearby calm river, moving slightly to the left in the frame. The boat's colorful hull contrasts with the earthy tones of the riverbank. The camera pans slowly from right to left, capturing both the rhythmic motion of the train and the gentle sway of the boat in harmony with the serene surroundings." + }, + { + "prompt_en": "a boat and an airplane", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "boat and airplane" + } + }, + "refined_prompt": "A sleek, modern boat is seen cruising across calm waters with its bow cutting gracefully through the waves. Above, a white airplane flies smoothly across the clear blue sky, leaving a faint trail behind. The boat is adorned with a white hull and has an elegant design, while the airplane glides effortlessly, reflecting the sunlight on its wings. The camera captures both the boat and the airplane in frame, maintaining a steady view as the scene unfolds, with the horizon visible in the distance." + }, + { + "prompt_en": "a bicycle and a car", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "bicycle and car" + } + }, + "refined_prompt": "A shiny, red bicycle stands parked beside a sleek, silver car on the side of a quiet street. The bicycle's bright color and streamline design emphasize its modern style, with its handlebars angled just slightly towards the car. The car's polished surface reflects the surrounding environment, capturing the soft light of the afternoon sun. The setting is peaceful, with leaves scattered on the ground, adding to the tranquil atmosphere. The camera slowly pans from the bicycle to the car, highlighting the contrast between the two vehicles." + }, + { + "prompt_en": "a car and a motorcycle", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "car and motorcycle" + } + }, + "refined_prompt": "A sleek, silver car and a stylish, black motorcycle are parked side by side on a smooth asphalt road. The car is a modern sedan with tinted windows, giving it a sophisticated appearance. 
Next to it, the motorcycle is a sporty model with a glossy finish and chrome accents, exuding a sense of speed and agility. The scene is illuminated by the soft, ambient light of a clear day, highlighting the polished surfaces of both vehicles. The camera slowly pans from left to right, capturing the details of the car and motorcycle, set against a backdrop of distant trees and an open sky." + }, + { + "prompt_en": "a motorcycle and a bus", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "motorcycle and bus" + } + }, + "refined_prompt": "On a busy city street, a sleek, red motorcycle is parked next to a large, blue bus. The motorcycle, with its shiny chrome details and leather seat, stands out against the bus's larger, more imposing structure. The bus has clear windows and an advertisement displayed on the side. The scene is set under bright daylight, highlighting the contrast between the compact motorcycle and the expansive bus. The camera slowly pans around the two vehicles, capturing their size difference and the bustling city environment around them." + }, + { + "prompt_en": "a bus and a traffic light", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "bus and traffic light" + } + }, + "refined_prompt": "A bright yellow bus is approaching a traffic light at a busy intersection. The traffic light hangs from a metal pole, and the light is currently green, indicating that the bus can continue moving forward. The bus has large windows and is clean and polished, reflecting the surrounding city lights. As the bus moves smoothly towards the camera, the cityscape in the background features tall buildings and bustling activity. The camera gently pans to follow the bus, capturing the urban atmosphere of the street." + }, + { + "prompt_en": "a traffic light and a fire hydrant", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "traffic light and fire hydrant" + } + }, + "refined_prompt": "In a bustling urban setting, a traffic light stands on a street corner, displaying a bright red light. Nearby, a fire hydrant painted in a vibrant yellow color is positioned on the sidewalk. The backdrop features a blurred view of city buildings and moving traffic, creating a sense of constant activity and movement. The camera does a slow zoom from afar to close up, capturing both the traffic light and the fire hydrant in a single frame, highlighting their vivid colors amidst the city's backdrop. The overall atmosphere is lively and energetic." + }, + { + "prompt_en": "a fire hydrant and a stop sign", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "fire hydrant and stop sign" + } + }, + "refined_prompt": "A red fire hydrant and a stop sign are positioned on a sidewalk corner. The fire hydrant, with its glossy red paint and metallic cap, stands prominently in the foreground. Just behind it, the stop sign is mounted on a silver pole, featuring a bright red octagonal shape with bold white lettering. The scene is set against a backdrop of greenery from nearby trees, with sunlight casting gentle shadows on the sidewalk, creating a calm and serene atmosphere. A gentle breeze causes the leaves to rustle softly. The camera captures a subtle movement, slightly panning to the right to reveal more of the sidewalk and street." 
+ }, + { + "prompt_en": "a stop sign and a parking meter", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "stop sign and parking meter" + } + }, + "refined_prompt": "The scene features a bright red stop sign positioned prominently on the left side of the frame. The sign is clearly visible with its bold white lettering catching the sunlight. To the right, a parking meter stands tall, its metallic surface reflecting the light. The parking meter has a digital display and buttons on its front. The background suggests a quiet urban street, with buildings out of focus in the distance. The camera pans slightly from left to right, emphasizing the relationship between the stop sign and the parking meter in the setting." + }, + { + "prompt_en": "a parking meter and a truck", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "parking meter and truck" + } + }, + "refined_prompt": "A close-up shot shows a parking meter positioned on a city street corner. Beside it, a rugged yet sleek truck is parked, facing away from the camera. The truck's polished silver finish glistens under the bright, midday sun. The parking meter displays a digital screen with red and green buttons visible. Its metallic surface complements the truck's shiny exterior. The camera pans slightly to capture the details of both the parking meter and the truck, highlighting each object's modern yet utilitarian design against the urban backdrop." + }, + { + "prompt_en": "a truck and a bicycle", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "truck and bicycle" + } + }, + "refined_prompt": "A large truck is parked on the side of a quiet road, while a sleek bicycle leans against it. The truck is painted a bright blue, with its doors closed and the sun glancing off its clean surface. The bicycle, with a shiny metallic frame and black tires, stands nearby, creating a striking contrast against the truck. The road and surrounding area are relatively empty, adding a sense of calmness to the scene. The shot remains static, focusing on the interaction between the two objects." + }, + { + "prompt_en": "a toilet and a hair drier", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "toilet and hair drier" + } + }, + "refined_prompt": "In a brightly lit bathroom, a modern white toilet with a sleek design is positioned next to a wall-mounted hair dryer. The hair dryer has a coiled cord and is secured in a holder attached to the wall above the toilet. The lighting accentuates the clean and polished surfaces, creating a fresh and hygienic atmosphere. The camera holds a steady angle capturing both the toilet and the hair dryer in the frame, highlighting their placement and functionality in the bathroom setting." + }, + { + "prompt_en": "a hair drier and a toothbrush", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "hair drier and toothbrush" + } + }, + "refined_prompt": "A scene features a hair dryer and a toothbrush placed side by side on a clean, white bathroom countertop. The hair dryer is sleek and modern, with a metallic finish and an ergonomic handle. The nozzle is slightly pointed downwards, as if ready to be picked up. Next to it, the toothbrush has a colorful handle, and soft bristles, standing upright in a small holder. 
The lighting is bright and crisp, highlighting the shiny surface of the hair dryer and the vibrant colors of the toothbrush. The camera slowly pans from left to right, showcasing both items clearly." + }, + { + "prompt_en": "a toothbrush and a sink", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "toothbrush and sink" + } + }, + "refined_prompt": "A toothbrush is positioned next to a modern, sleek sink. The toothbrush features a vibrant blue handle with white bristles. The sink, made of shiny ceramic, is clean and well-maintained, with a streamlined chrome faucet. A few droplets of water glisten on the sink's edge, reflecting the bright, ambient light in the bathroom. The camera pans slowly from the toothbrush to the sink, capturing the smooth textures and polished surfaces, creating a refreshing and hygienic atmosphere." + }, + { + "prompt_en": "a sink and a toilet", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "sink and toilet" + } + }, + "refined_prompt": "In a bright and clean bathroom, a white porcelain sink and matching toilet are prominently displayed. The sink, featuring a sleek silver faucet, is positioned to the left, with a bar of soap resting neatly on its rim. The toilet, with its smooth, rounded design, stands to the right, accompanied by a small, elegantly folded bathroom towel on its lid. The walls are tiled in a light, neutral color, enhancing the clean and serene atmosphere. Soft, ambient lighting casts a gentle glow across the entire scene." + }, + { + "prompt_en": "a wine glass and a chair", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "wine glass and chair" + } + }, + "refined_prompt": "A wine glass filled with rich red wine is positioned elegantly on a wooden table. Behind the glass, there's a sleek, modern chair with a slightly curved backrest, featuring an upholstered seat in a muted tone. Soft, ambient lighting creates a warm and inviting atmosphere, casting gentle shadows and reflections. The camera slowly pans from the wine glass to the chair, capturing the sophisticated simplicity of the scene." + }, + { + "prompt_en": "a cup and a couch", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "cup and couch" + } + }, + "refined_prompt": "A close-up shot shows a ceramic cup with an intricate floral design resting on the armrest of a plush couch. The cup holds steaming coffee, with wisps of steam curling up into the air, adding a cozy and inviting atmosphere. The couch is upholstered in a soft, textured fabric with a warm, neutral hue, contributing to an overall feeling of comfort. The camera captures the gentle play of light casting subtle shadows, creating a peaceful and serene mood in the setting." + }, + { + "prompt_en": "a fork and a potted plant", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "fork and potted plant" + } + }, + "refined_prompt": "A shiny stainless steel fork is angled towards the camera with its prongs gleaming under soft natural lighting. In the background, a small potted plant is visible. The plant has vibrant green leaves cascading over the edges of a simple white pot. The camera gently pans to keep both the fork and the potted plant in view, creating a serene and harmonious atmosphere." 
+ }, + { + "prompt_en": "a knife and a tv", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "knife and tv" + } + }, + "refined_prompt": "A sleek, stainless steel knife is resting on a polished kitchen counter, its sharp blade catching the ambient light. In the background, a modern flat-screen TV is mounted on the wall. The TV displays a vibrant scene with colorful, fast-moving images that reflect faintly on the knife's surface, adding an interesting interplay of light and color. The scene captures the contrast between the static knife and the dynamic visuals on the TV. The camera pans slightly from left to right, highlighting both objects in the frame." + }, + { + "prompt_en": "a spoon and a laptop", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "spoon and laptop" + } + }, + "refined_prompt": "A spoon and a laptop are positioned on a wooden table, creating a contrast between the modern and traditional elements. The laptop is open, displaying a bright, colorful screen with a background image, while the spoon lies nearby, its metallic surface reflecting light. The camera pans slowly from the spoon upwards to capture the laptop's sleek design, highlighting the tech-savvy atmosphere of the scene. Soft, warm lighting enhances the cozy and intimate setting." + }, + { + "prompt_en": "a bowl and a remote", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "bowl and remote" + } + }, + "refined_prompt": "A simple yet inviting scene features a bowl and a remote control on a wooden coffee table. The bowl, made of ceramic, is filled with colorful fruit, including a bright red apple and a cluster of vibrant green grapes. The remote control, sleek with a shiny black finish, is placed beside the bowl, with its buttons clearly visible. Soft, natural light filters through a nearby window, casting gentle shadows and creating a warm, cozy atmosphere. The camera captures this tranquil arrangement with a smooth, slightly elevated angle, highlighting the contrast between the organic shapes of the fruit and the structured form of the remote." + }, + { + "prompt_en": "a banana and a keyboard", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "banana and keyboard" + } + }, + "refined_prompt": "A banana rests casually on the left side of a keyboard, which has a sleek and modern design with black keys and a minimalist layout. The banana is perfectly ripe, with a vivid yellow peel displaying a slight sheen under ambient lighting, adding warmth to the scene. The camera gradually pans from left to right, showcasing the keyboard's smooth keys, which are evenly spaced and catch reflections from the surrounding light. This creates a simple yet visually pleasing contrast between the natural shape of the banana and the structured arrangement of the keyboard." + }, + { + "prompt_en": "an apple and a cell phone", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "apple and cell phone" + } + }, + "refined_prompt": "An apple and a cell phone are placed side by side on a wooden table. The apple is perfectly round and vibrant red, with a stem and a single green leaf. The cell phone is sleek and modern, with a glossy black screen reflecting the ambient light. 
The camera slowly pans in from the right, highlighting the contrast between the natural texture of the apple and the smooth, polished surface of the phone. The table’s grains provide a rustic background, adding to the overall composition." + }, + { + "prompt_en": "a sandwich and a book", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "sandwich and book" + } + }, + "refined_prompt": "A neatly arranged sandwich sits on a clean white plate beside an open book. The sandwich is layered with fresh lettuce, slices of ripe tomato, and a few slices of turkey, all tucked between lightly toasted bread. The book is opened to reveal crisp, neatly printed pages, inviting a reader to enjoy a quiet moment of relaxation. The warm lighting casts a cozy glow on the scene, highlighting the textures of both the sandwich and the book. The camera gently pans from the sandwich to the book, creating a serene and inviting atmosphere." + }, + { + "prompt_en": "an orange and a clock", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "orange and clock" + } + }, + "refined_prompt": "A close-up shot reveals an orange sitting next to a small, vintage-style clock on a wooden table. The clock has a round face with Roman numerals and elegant hands, displaying the time. The orange, with its bright, textured peel, is positioned beside the clock. The lighting is warm and soft, creating a cozy atmosphere. The camera slowly moves around, offering different angles of the orange and clock, emphasizing the contrast between the organic fruit and the mechanical timepiece." + }, + { + "prompt_en": "broccoli and a backpack", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "broccoli and backpack" + } + }, + "refined_prompt": "A fresh head of broccoli rests on a smooth surface next to a stylish, open backpack. The broccoli's vibrant green florets stand out against the darker surface, accentuated by soft, natural lighting that highlights its fresh texture. The backpack is a rich, earthy brown, with intricate stitching along its seams and a partially opened zipper that reveals a glimpse of its interior. The scene is set against a simple background, with the camera slowly panning from left to right, capturing the contrasting colors and textures of the broccoli and the backpack." + }, + { + "prompt_en": "a carrot and an umbrella", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "carrot and umbrella" + } + }, + "refined_prompt": "A vibrant orange carrot lies on a wooden surface, its green leafy top extending outward. Next to the carrot, a colorful umbrella with a fabric canopy is partially open, showcasing its bright stripes of yellow, blue, and red. The umbrella's metal frame glistens under bright lighting, casting a playful shadow on the surface. The camera zooms in slightly to focus on the contrast between the natural texture of the carrot and the smooth, synthetic material of the umbrella." + }, + { + "prompt_en": "a hot dog and a handbag", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "hot dog and handbag" + } + }, + "refined_prompt": "A hot dog is resting on a smooth wooden surface with vibrant toppings of mustard, ketchup, and relish adding a burst of color. Beside it, a stylish handbag crafted from textured leather in a rich shade of brown is placed elegantly. 
The handbag features a sleek metal buckle and detailed stitching, emphasizing its refinement. The camera slowly pans from the hot dog to the handbag, highlighting the contrasting elements of casual fast food and fashionable accessories in a sophisticated setting." + }, + { + "prompt_en": "a pizza and a tie", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "pizza and tie" + } + }, + "refined_prompt": "A pizza rests on a simple wooden table, its crust golden-brown and topped with a variety of colorful ingredients, including bright red tomato slices, fresh green basil leaves, and melted cheese. Next to the pizza, a neatly folded tie is placed, contrasting sharply with the rustic food. The tie is dark blue with a subtle, elegant pattern, carefully arranged so both items share prominence in the shot. The camera slightly pans to the right, offering a closer look at the steaming pizza and the tie's intricate pattern, set against a softly lit, cozy atmosphere." + }, + { + "prompt_en": "a donut and a suitcase", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "donut and suitcase" + } + }, + "refined_prompt": "A single donut with colorful sprinkles rests atop a stylish, modern suitcase. The donut, with its golden-brown texture and vibrant sprinkles, stands out vividly against the suitcase's sleek, metallic surface. The suitcase, with polished edges and horizontal grooves running along its body, has an air of sophistication. The camera moves slowly around the scene, highlighting the contrast between the playful donut and the elegant suitcase. The lighting casts soft shadows, emphasizing the textures and colors." + }, + { + "prompt_en": "a cake and a vase", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "cake and vase" + } + }, + "refined_prompt": "A beautifully decorated cake and an elegant vase are displayed on a wooden table. The cake is adorned with intricate pink frosting swirls and topped with a small bouquet of fresh, vibrant flowers. Next to it, the vase, made of clear glass, holds a charming arrangement of delicate purple and white flowers. The scene is lit with soft, natural lighting, creating a warm and inviting atmosphere. The camera slowly zooms in to capture the detailed decorations on the cake, then shifts focus to capture the vivid colors and textures of the flowers in the vase." + }, + { + "prompt_en": "an oven and scissors", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "oven and scissors" + } + }, + "refined_prompt": "In a kitchen setting, an oven with a sleek stainless steel finish is featured, illuminated by soft, ambient lighting. On the counter next to the oven lies a shiny pair of scissors, reflecting some light from the room. The scissors are open slightly, and they rest atop a simple wooden cutting board. The camera angle is slightly elevated, providing a clear view of both the oven and the scissors. The scene maintains a calm and organized ambiance, emphasizing the kitchen’s functional yet stylish design." + }, + { + "prompt_en": "a toaster and a teddy bear", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "toaster and teddy bear" + } + }, + "refined_prompt": "On a kitchen countertop, a shiny stainless steel toaster is positioned next to a plush teddy bear. 
The toaster has two slices of bread popping up, with steam gently rising from them, indicating they are freshly toasted. The teddy bear, with soft brown fur and a cheerful expression, sits beside the toaster, as if watching the toasting process. Natural sunlight streams in through a nearby window, creating a warm and inviting atmosphere. The camera subtly zooms in to capture the fine details of the teddy bear's fur and the toaster's reflective surface." + }, + { + "prompt_en": "a microwave and a frisbee", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "microwave and frisbee" + } + }, + "refined_prompt": "A microwave, with a sleek stainless steel exterior, sits on a countertop with its door slightly open. To the right of the microwave, a brightly colored frisbee, with a blue and white pattern, is positioned upright, leaning against the microwave. The countertop is clean and simple, with a neutral background that highlights the futuristic design of the microwave and the vivid colors of the frisbee. The camera remains static, providing a clear view of both objects in a well-lit kitchen setting." + }, + { + "prompt_en": "a refrigerator and skis", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "refrigerator and skis" + } + }, + "refined_prompt": "A sleek, modern refrigerator stands in a kitchen with stainless steel doors that reflect its surroundings. Next to the refrigerator, a pair of skis is leaning against the wall. The skis have a vibrant, colorful design with contrasting patterns, creating an interesting visual juxtaposition with the metallic texture of the refrigerator. The lighting is bright and even, highlighting the clean and organized appearance of the kitchen space. The camera slowly pans from the top of the refrigerator down to the skis, emphasizing the contrast between the two objects." + }, + { + "prompt_en": "a bicycle and an airplane", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "bicycle and airplane" + } + }, + "refined_prompt": "A bicycle is parked on a runway, positioned in the foreground, with a large airplane visible in the distance. The bicycle has a sleek frame with a dark blue color and black tires, resting on its kickstand. In the background, an airplane is approaching the runway, its landing gear extended as it prepares to touch down. The scene is set under a clear blue sky, and the sunlight casts sharp shadows of the bicycle onto the runway. The perspective captures the contrast in size and function between the two modes of transportation." + }, + { + "prompt_en": "a car and a train", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "car and train" + } + }, + "refined_prompt": "On a sunny day, a sleek, red sports car is parked near a railway crossing, its polished surface glinting in the sunlight. In the distance, a powerful, modern train approaches, moving steadily along the tracks. As the train comes into closer view, its cars and windows reflect the bright sky. The scene captures the moment just before the train crosses the track, with the car positioned safely at the edge of the crossing, emphasizing the contrast between the stationary car and the dynamic movement of the approaching train." 
+ }, + { + "prompt_en": "a motorcycle and a boat", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "motorcycle and boat" + } + }, + "refined_prompt": "A sleek motorcycle, with a polished chrome finish and gleaming black body, is parked on the edge of a wooden dock. Next to the dock, a small, stylish speedboat gently rocks with the subtle motion of the water. The boat features a striking white hull with a glossy red stripe along the side. The scene is set against a clear blue sky, and the sunlight glints off the surfaces of both the motorcycle and the boat, highlighting their contours and details. The camera slowly pans from the motorcycle to the boat, capturing the relaxed yet adventurous atmosphere of the setting." + }, + { + "prompt_en": "a person and a toilet", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "person and toilet" + } + }, + "refined_prompt": "A woman is bending over slightly as she cleans a toilet in a bright, clean bathroom. She is wearing casual clothing, including a light blue blouse and dark jeans. Her hair is neatly pulled back, and she appears focused on her task. The toilet is spotless, and the surrounding bathroom area is well-organized with white tiles and a small window allowing natural light. The woman's expression is one of concentration as she diligently works. The camera captures this scene from a side angle, giving a clear view of her actions and the cleanliness of the environment." + }, + { + "prompt_en": "a person and a hair drier", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "person and hair drier" + } + }, + "refined_prompt": "A woman with long, flowing hair stands in a cozy bedroom, gently drying her hair with a sleek, modern hair dryer. The soft lighting accentuates her charming features, including her bright eyes and warm smile. She is wearing a casual, comfortable sweater, and her posture is relaxed yet focused on her task. The camera slowly zooms in to capture the smooth motion of her hair as it sways with the warm air from the dryer, creating a peaceful and serene atmosphere." + }, + { + "prompt_en": "a person and a toothbrush", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "person and toothbrush" + } + }, + "refined_prompt": "A young woman is standing in front of a bathroom mirror, holding an electric toothbrush. She has long, glossy hair and is dressed in a light, casual top. She gazes at herself in the mirror with a bright, charming smile as she turns on the toothbrush. The toothbrush hums softly, its bristles gently vibrating. The reflection of the bathroom lights casts a soft glow over her, creating a warm and inviting atmosphere." + }, + { + "prompt_en": "a person and a sink", + "dimension": [ + "multiple_objects" + ], + "auxiliary_info": { + "multiple_objects": { + "object": "person and sink" + } + }, + "refined_prompt": "A woman stands in front of a modern bathroom sink, with her hands under the running water. She appears to be thoroughly washing her hands, creating a cascading flow of water around her fingers. The sink is sleek, with a polished silver faucet and a glossy ceramic basin. The woman has shoulder-length, wavy hair and wears a light-colored blouse. Her expression is focused and serene as she looks down at her hands. The ambient lighting is soft, creating a calm and clean atmosphere in the bathroom. 
The camera gently zooms in to capture the water flow and her hand movements in more detail." + }, + { + "prompt_en": "A person is riding a bike", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young man is riding a bike along a scenic pathway, with lush green trees lining both sides. He is wearing a casual t-shirt and shorts, and his hair flows gently in the breeze as he pedals forward. The sun filters through the trees, creating a dappled light effect on the path. The camera follows his movement smoothly, capturing the rhythm of his pedaling and the serene, natural surroundings." + }, + { + "prompt_en": "A person is marching", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person is marching with purposeful strides along a paved walkway in a park, surrounded by lush greenery. He is a handsome young man in his twenties, wearing a crisp white shirt and dark trousers. His expression is focused and determined, and his posture is upright, conveying confidence. The camera follows him, slightly elevated, capturing his rhythmic movements as he marches forward towards the camera, while the tree-lined path creates a serene and natural backdrop." + }, + { + "prompt_en": "A person is roller skating", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young, athletic woman is roller skating gracefully on a smooth path. She has long flowing hair and wears a stylish, fitted tank top and shorts. As she skates towards the camera, her movements are fluid and controlled, effortlessly gliding forward with a joyful expression on her face. The sun casts a warm, golden glow, creating a lively and energetic atmosphere. The camera smoothly follows her motion, capturing her skilled and elegant skating." + }, + { + "prompt_en": "A person is tasting beer", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is tasting beer in a cozy pub setting with warm ambient lighting. He is handsome, with short, neatly styled hair, and is dressed casually in a dark shirt. The man lifts a clear glass of amber beer to his lips, savoring the aroma before taking a small sip. His expression is thoughtful and appreciative as he tastes the beer, and he slightly nods as if savoring the flavor. The scene is framed with wooden tables and shelves filled with various bottles in the background, adding to the inviting and relaxed atmosphere. The camera slowly zooms in, capturing the thoughtful expression on his face." + }, + { + "prompt_en": "A person is clapping", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is standing and clapping her hands with a bright smile. She has long, wavy hair cascading down her shoulders and is wearing an elegant blue dress that accentuates her charm. Her eyes are sparkling with joy as she applauds enthusiastically. The camera captures her from the front, zooming in slightly to highlight her expressive facial features and graceful movements, creating a warm and celebratory atmosphere." + }, + { + "prompt_en": "A person is drawing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is seated at a desk, deeply focused on drawing. Her long, wavy hair cascades over her shoulders, framing her delicate features. She is dressed in a light, flowing blouse, giving off a charming and serene appearance. The woman holds a pencil gracefully as she sketches on a sheet of paper, her hand moving smoothly and confidently across the page. 
The camera tilts slowly to reveal her concentrated expression and the intricate lines of her drawing. Soft, ambient lighting creates a calm and creative atmosphere." + }, + { + "prompt_en": "A person is petting animal (not cat)", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman, with gentle features and long, flowing hair, is kneeling down beside a fluffy golden retriever. She is wearing a light, floral dress and smiling warmly as she pets the dog's head. The dog looks content and relaxed, sitting obediently on a patch of vibrant green grass. The woman's hand moves gently over the dog's fur, creating a moment of calm and affection. Sunlight filters through the leaves of nearby trees, casting playful shadows on the ground. The scene captures a serene and heartwarming interaction between the woman and the dog." + }, + { + "prompt_en": "A person is eating watermelon", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is sitting at a table, enjoying a slice of juicy watermelon. She has long, flowing hair and is wearing a colorful summer dress that complements the bright, sunlit atmosphere around her. The woman holds the wedge with both hands, taking a bite, as the watermelon juice glistens in the sunlight. Her expression is one of contentment and pleasure as she savors the refreshing treat. The camera gently zooms in to capture her delighted face and the vibrant colors of the watermelon." + }, + { + "prompt_en": "A person is playing harp", + "dimension": [ + "human_action" + ], + "refined_prompt": "A beautiful young woman is gracefully playing the harp, positioned in an elegantly decorated room. She has long, flowing hair and is wearing a delicate, flowing white dress. Her fingers gently pluck the strings of the harp, creating a melodious tune. The room is softly lit by a warm, ambient glow, enhancing the serene and harmonious atmosphere. The camera captures her from a side angle, slowly panning to showcase her focused expression and skillful hand movements across the strings." + }, + { + "prompt_en": "A person is wrestling", + "dimension": [ + "human_action" + ], + "refined_prompt": "A muscular man is engaged in a wrestling match on a padded mat. He has short hair and is wearing a tight-fitting athletic singlet. His face shows intense focus and determination as he grapples with an unseen opponent. His movements are powerful and controlled, as he leans forward, muscles tensed, in a strong defensive position. The lighting casts shadows highlighting his muscular physique. The camera captures the scene at a slight angle, providing a dynamic view of the action." + }, + { + "prompt_en": "A person is riding scooter", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young man is riding a scooter down a smooth, paved path. His stylish appearance includes a casual t-shirt and jeans, with a pair of sunglasses resting on his face, giving him a cool and confident look. The scooter speeds along smoothly, and the man maintains an upright posture as he navigates. Sunlight creates a bright, cheerful atmosphere, casting shadows on the path as the scene progresses. The camera follows him from a side angle, capturing his motion against a backdrop of green trees and a clear blue sky." + }, + { + "prompt_en": "A person is sweeping floor", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is sweeping a spacious wooden floor in a well-lit room, with sunlight streaming in through large windows. 
He is wearing a casual blue t-shirt and comfortable jeans, with a focused expression as he sweeps dust and debris into a neat pile. The man moves steadily across the floor, his posture straight and purposeful. The gentle glint of sunlight reflects off the polished wood, adding a sense of warmth to the atmosphere." + }, + { + "prompt_en": "A person is skateboarding", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young man is skateboarding on a smooth, sunlit pavement. Dressed in casual attire, he wears a black t-shirt and denim shorts, exuding a relaxed and confident vibe. His skateboard glides effortlessly across the ground as he expertly maneuvers, occasionally performing small tricks. The sun casts a warm light over the scene, highlighting his focused expression and the fluid motion of his ride. The camera follows him smoothly, capturing the sense of freedom and skill in his swift movements." + }, + { + "prompt_en": "A person is dunking basketball", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young man, athletic and with well-defined features, is seen soaring through the air towards a basketball hoop. Wearing a sleeveless sports jersey and shorts, his muscular arms are extended as he grips a basketball firmly. His expression is focused and intense as he prepares to slam the ball through the hoop. The shot captures him from a side angle, emphasizing his leap and power. The gym setting is bright, with overhead lights highlighting the action and casting shadows on the polished wooden floor below." + }, + { + "prompt_en": "A person is playing flute", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is playing the flute with a warm expression on her face. She has long, flowing hair and is wearing a light, elegant dress that complements her graceful presence. The setting is softly lit, with a gentle, ambient glow highlighting her features. The woman stands in a relaxed posture, holding the flute delicately in her hands. The camera captures her from a slight upward angle, emphasizing her poised and charming demeanor as she performs, allowing a glimpse of the serene background behind her." + }, + { + "prompt_en": "A person is stretching leg", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman stands gracefully beside a serene yoga mat on a wooden floor, gently stretching her leg in a classic yoga pose. Her body is positioned at an angle, with one leg extended behind her, and her arms outstretched for balance. She is wearing a fitted yoga outfit, and her hair is neatly tied back, revealing her serene expression of concentration and calm. The natural light from a nearby window casts a soft glow across the scene, adding to the peaceful atmosphere. The camera slowly moves around her, capturing the fluidity and elegance of her pose." + }, + { + "prompt_en": "A person is tying tie", + "dimension": [ + "human_action" + ], + "refined_prompt": "A handsome man is standing in a well-lit room, looking into a mirror as he ties a tie around his neck. He has short, neat hair and is wearing a crisp white dress shirt. His expression is focused yet calm as he carefully loops and knots the tie. The man's swift, practiced movements reflect his confidence. The room is elegantly furnished, with a classic, sophisticated ambiance and a soft, warm light casting gentle shadows across the scene. The camera steadily zooms in to capture the man's precise hand movements before settling on a close-up of the perfectly tied knot." 
+ }, + { + "prompt_en": "A person is skydiving", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person, involved in a thrilling skydiving adventure, is captured in mid-air with an expansive view of the earth beneath. The individual is wearing a sleek, aerodynamic jumpsuit and a helmet with a clear visor. Their body is perfectly positioned in a freefall form, arms spread wide, and legs slightly bent for stability. Below, the landscape unfolds with patches of green fields and winding rivers, creating a beautiful contrast against the blue sky and fluffy white clouds. The camera gently follows, capturing the exhilarating sense of freedom and adventure." + }, + { + "prompt_en": "A person is shooting goal (soccer)", + "dimension": [ + "human_action" + ], + "refined_prompt": "A male soccer player, exhibiting athleticism and focus, prepares to shoot a goal on an open soccer field. He is dressed in a fitted, vibrant jersey and shorts, with cleats gripping the grass beneath him. As he approaches the ball, he demonstrates precise control, his posture showing determination and grace. The camera smoothly follows his movement from behind as he strikes the ball with power and accuracy toward the goal. The scene is set under clear, bright daylight, enhancing the vivid colors of the field and his athletic attire." + }, + { + "prompt_en": "A person is playing piano", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is gracefully playing the piano, her fingers delicately moving over the keys. She is seated on a bench, wearing an elegant, flowing dress that complements her charming beauty. Her long hair cascades over her shoulders as she leans forward slightly, focused intently on the music. The ambient lighting creates a warm and intimate atmosphere, casting soft shadows around her. The camera gently zooms in to capture the expressive movement of her hands as they glide over the black and white keys." + }, + { + "prompt_en": "A person is finger snapping", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman, with long flowing hair and a bright smile, stands against a softly lit background. She is stylishly dressed in a casual, fitted shirt. She extends her hand towards the camera and gracefully snaps her fingers, her wrist and fingers moving fluidly. The sound of the snap resonates with a crisp, clear tone. The gentle lighting highlights her expressive face, capturing her charming and engaging demeanor. The camera slightly zooms in to focus on her hand movement, emphasizing the snap." + }, + { + "prompt_en": "A person is canoeing or kayaking", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person is skillfully kayaking across a serene lake, with the morning light casting a soft glow over the water. The individual, who appears to be a young adult, is wearing a vibrant red life jacket and a sun hat. The kayak glides smoothly through the calm water, creating small ripples that expand outward. The person paddles with a steady rhythm, moving from right to left as they progress. The backdrop features lush greenery along the shoreline, enhancing the tranquility of the scene. The camera follows alongside the kayak, capturing the gentle motion and the peaceful environment." + }, + { + "prompt_en": "A person is laughing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is seated in an outdoor café, radiating charm and beauty as she laughs joyfully. 
Her long, flowing hair shimmers in the sunlight, and she has an infectious smile that lights up her face. She is wearing a soft, pastel blouse and casual jeans. Her eyes sparkle with delight as she looks slightly away, with her hand gently touching her cheek. The warm, golden light of the late afternoon enhances the cozy ambiance of the scene, and the camera subtly zooms in to capture her exuberant expression." + }, + { + "prompt_en": "A person is digging", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young, athletic man is digging in a lush garden, surrounded by vibrant greenery. He is wearing a fitted t-shirt and jeans, with well-defined muscles visible as he firmly grips a shovel. His expression is focused and determined as he lifts a chunk of soil and places it aside. The sunlight filters through the leaves, casting a dappled pattern on the ground. The camera smoothly pans around him, capturing the energy and dedication in his movements as he continues to dig." + }, + { + "prompt_en": "A person is clay pottery making", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is skillfully making clay pottery in a serene workshop. She is seated at a pottery wheel, wearing a simple apron over her clothes, and she has her hair neatly tied back. Her hands are gently shaping a lump of clay as the wheel spins, forming it into a vessel. The soft light in the room highlights her focused expression and the texture of the clay. Shelves filled with completed pottery pieces line the background, adding to the artistic atmosphere. The camera slowly circles around her to capture different angles of her careful craftsmanship." + }, + { + "prompt_en": "A person is shooting basketball", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is shooting a basketball on an outdoor court under a clear blue sky. He stands at the free-throw line, focusing intensely on the hoop. The basketball court is surrounded by trees, creating a serene backdrop. The man, wearing a sleeveless athletic shirt and shorts, bends his knees slightly for balance, and, with a rhythmic motion, he lifts the ball and releases it with a perfect arc towards the hoop. The camera follows the ball as it sails through the air, capturing the moment of its descent into the hoop." + }, + { + "prompt_en": "A person is bending back", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is gracefully bending backwards, performing a deep backbend on a grassy field. She is wearing a form-fitting athletic outfit that accentuates her curves and flexibility. Her long hair cascades down towards the ground, creating a captivating silhouette against a soft evening light. The camera slowly circles around her, providing a dynamic view of her form and the serene nature setting. The atmosphere is tranquil, with a gentle breeze rustling the surrounding grass." + }, + { + "prompt_en": "A person is shaking hands", + "dimension": [ + "human_action" + ], + "refined_prompt": "In an office setting with soft, natural lighting, a man and a woman are shaking hands. The man is handsome, with neatly styled hair, a warm smile, and is dressed in a crisp white shirt and a navy-blue suit. The woman is charming and beautiful, with long, flowing hair and a gentle expression, wearing a light blouse and a tailored blazer. The handshake is firm and confident, symbolizing agreement or greeting. 
The camera captures this moment with a smooth zoom-in, focusing on the hands and then retreating slightly to reveal their friendly expressions." + }, + { + "prompt_en": "A person is bandaging", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is carefully bandaging her left wrist while sitting in a small, well-lit room. She is wearing a comfortable T-shirt and jeans, and her hair is loosely tied back. Her expression is focused and attentive as she wraps a white bandage snugly around her wrist. The room is peaceful and organized, with soft sunlight streaming in through a nearby window. The camera captures this moment from a slightly elevated angle, gently zooming in to highlight the woman's concentration and the details of her bandaging technique." + }, + { + "prompt_en": "A person is push up", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is performing push-ups on a grassy outdoor field under the clear blue sky. He is muscular and fit, wearing a sleeveless athletic top and shorts. His arms are positioned shoulder-width apart, and he maintains excellent form as he lowers his body toward the ground and pushes back up. The sun casts a warm light, creating a vibrant and lively atmosphere. The camera smoothly pans from a side view to a frontal angle, capturing the man's determined expression and highlighting his focus and strength." + }, + { + "prompt_en": "A person is catching or throwing frisbee", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young, athletic man is skillfully catching a frisbee on a grassy field. He has a lean and fit appearance, wearing a sleeveless sports shirt and shorts. His eyes are focused on the frisbee as he leaps into the air, arms outstretched to catch it. The motion of his jump is fluid and dynamic, with the camera tilting upwards to follow his ascent. The vibrant green grass and clear blue sky create a lively and energetic atmosphere around him." + }, + { + "prompt_en": "A person is playing trumpet", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is playing a trumpet, holding it confidently up to her lips. She stands with a poised posture, her eyes focused ahead. The warm lighting creates a cozy atmosphere and highlights the golden sheen of the trumpet. She wears an elegant blouse, and her long, wavy hair cascades over her shoulders. The sound of the trumpet resonates as her fingers gracefully move over the valves. The camera gently pans around her, capturing her dedication and the rich sound emanating from the instrument." + }, + { + "prompt_en": "A person is flying kite", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young boy is flying a kite in an open field, with the clear blue sky serving as a beautiful backdrop. He is standing with both hands gripping the string tightly, and he looks up with a joyful expression as the colorful kite dances in the breeze above him. The boy is wearing a bright red t-shirt and jeans, and he stands firmly on the grassy ground as he moves the string slightly to guide the kite's flight. The camera smoothly pans up from the boy to focus on the kite soaring high against the sky." + }, + { + "prompt_en": "A person is filling eyebrows", + "dimension": [ + "human_action" + ], + "refined_prompt": "A beautiful woman is filling in her eyebrows while seated at a vanity. She has long, flowing hair and is wearing a stylish top that complements her poised appearance. 
She carefully uses an eyebrow pencil, applying small, precise strokes to enhance the shape of her eyebrows. Her expression is focused and gentle, highlighting her delicate features. The soft, ambient lighting creates a warm and intimate atmosphere, and a mirror on the vanity reflects her image as she perfects her look." + }, + { + "prompt_en": "A person is shuffling cards", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is skillfully shuffling a deck of playing cards. He has a focused expression as he deftly interlaces the cards with precision and ease. The motion is smooth and fluid, with the cards softly clattering as they fall into place. He is wearing a fitted, dark shirt, and his hands move nimbly, showcasing his mastery. The lighting casts soft shadows, highlighting the cards and his movements. The camera captures a close-up view, focusing on the man's hands as they expertly manipulate the deck." + }, + { + "prompt_en": "A person is folding clothes", + "dimension": [ + "human_action" + ], + "refined_prompt": "A charming woman is folding clothes in a cozy, warmly lit room. She has long, flowing hair and a gentle smile. She is wearing a soft, pastel-colored sweater. The woman carefully folds a variety of garments, including shirts and pants, with graceful precision. The neatly folded clothes are placed in a stack on a nearby table. The camera captures her actions in a smooth panning motion, enhancing the serene and organized atmosphere around her." + }, + { + "prompt_en": "A person is smoking", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man in his mid-30s is standing casually against a brick wall, smoking a cigarette. He has short dark hair and is wearing a fitted leather jacket and jeans. The man takes a slow drag from the cigarette and exhales the smoke, which disperses into the air around him. His expression is relaxed, and the shadows from the overhead lighting create a moody atmosphere, adding depth to the scene. The camera slowly pans from his face to his hand holding the cigarette, capturing the smoke's delicate patterns as it rises." + }, + { + "prompt_en": "A person is tai chi", + "dimension": [ + "human_action" + ], + "refined_prompt": "A graceful woman is practicing tai chi in a serene garden setting. Dressed in a flowing white outfit that moves softly with her graceful motions, she performs fluid, slow movements with focused precision. Her long hair is elegantly tied back, allowing her calm and serene expression to be clearly seen. The garden is lush with greenery, providing a peaceful atmosphere as she shifts her weight gently from one foot to the other. The lighting is soft, enhancing the tranquil atmosphere as the camera gently pans around her, capturing the elegance of her practice." + }, + { + "prompt_en": "A person is squat", + "dimension": [ + "human_action" + ], + "refined_prompt": "In a brightly lit gym setting, a muscular man is performing a squat exercise with a barbell resting on his broad shoulders. He wears a fitted sleeveless top and gym shorts, highlighting his toned physique. The man squats with perfect form, lowering his body steadily while keeping his gaze forward, focused and determined. The gym background is filled with other equipment, and the floor is covered with protective mats. The camera captures this scene from a low angle, emphasizing the man's strength and balance as he moves dynamically." 
+ }, + { + "prompt_en": "A person is playing controller", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young man is intensely focused on playing a video game, gripping a sleek, modern gaming controller in his hands. He sits comfortably on a plush sofa, dressed casually in a fitted t-shirt and jeans. The expression on his face is one of concentration and determination, with his eyebrows slightly furrowed. Warm ambient lighting creates a cozy atmosphere in the room, with the soft glow reflecting off his face as he interacts with the game. The camera holds a steady close-up on the man's face and hands, capturing the intensity of his gaming experience." + }, + { + "prompt_en": "A person is throwing axe", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is standing a few yards away from a wooden target, preparing to throw an axe with focus and precision. He is wearing a fitted short-sleeve t-shirt and jeans, and his muscular build is evident. With a concentrated expression, he raises the axe behind his head with both hands, steps forward, and powerfully releases it towards the target in a smooth motion. The axe spins gracefully through the air. The camera pans smoothly to track the axe’s trajectory as it flies towards the target. The setting is outdoors, with trees and sunlight filtering through the branches, adding an invigorating atmosphere to the scene." + }, + { + "prompt_en": "A person is giving or receiving award", + "dimension": [ + "human_action" + ], + "refined_prompt": "A charming woman is standing on a stage, holding a glossy trophy as she receives an award. She is elegantly dressed in a stunning evening gown that shimmers under the bright stage lights. Her long, flowing hair cascades down her shoulders, and her face is lit up with a joyful smile. The background features a colorful backdrop with a soft spotlight focused on her. The camera captures this uplifting moment from a front-facing angle, zooming in slightly to emphasize her expression of happiness and pride." + }, + { + "prompt_en": "A person is air drumming", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young man is enthusiastically air drumming, his movements animated and full of rhythm. He is standing in a living room with comfortable furniture in the background. He has short, neatly styled hair and is dressed casually in a fitted t-shirt and jeans. His facial expression is one of pure joy and concentration, as if he can hear the drumbeat in his mind. The camera slowly circles around him, capturing different angles of his energetic performance. The warm lighting in the room creates an inviting and lively atmosphere, emphasizing his dynamic movements." + }, + { + "prompt_en": "A person is taking a shower", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person is taking a shower in a modern bathroom. The scene captures the upper body of a man with a well-toned physique, under a rain showerhead. Water cascades down over his head and shoulders, creating a relaxing and refreshing ambiance. The bathroom features sleek, tiled walls and a glass shower enclosure, with steam slightly fogging the glass. Soft, diffused lighting adds a warm and calming atmosphere to the setting. The camera remains steady, focusing on the soothing flow of water." + }, + { + "prompt_en": "A person is planting trees", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man, with a fit and handsome appearance, is carefully planting a young tree sapling in a grassy field. 
He is kneeling on the ground, wearing a casual t-shirt and jeans, as he gently places the sapling in a small hole in the soil. His expression is focused and content. The camera captures the scene from a low angle, emphasizing the green foliage around him and bringing the bright blue sky in the background into view. The atmosphere is peaceful and infused with the warmth of the afternoon sun." + }, + { + "prompt_en": "A person is sharpening knives", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is sharpening a knife in a well-lit kitchen. He is handsome, with short, dark hair and a confident expression. He holds a sharpening steel in one hand and a knife in the other, carefully drawing the blade across the steel at a precise angle. The man wears a crisp, white shirt, and his posture is focused yet relaxed. The kitchen background includes wooden countertops and a few colorful vegetables on a cutting board. As the knife glides smoothly over the steel, the sound of metal on metal adds a rhythmic quality to the scene. The camera captures this action from a close-up angle, emphasizing the sharpness of the blade and the man's skillful technique." + }, + { + "prompt_en": "A person is robot dancing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young man is performing a robot dance in a brightly lit room with a modern and minimalistic design. He is stylishly dressed in a fitted black t-shirt and dark jeans, and he exudes charm with each precise and mechanical movement. His expressions are serious and focused as he executes sharp, controlled robotic motions, his arms and legs moving in angular patterns. The camera smoothly circles around him, capturing his dance from various angles and emphasizing the fluidity and precision of his movements." + }, + { + "prompt_en": "A person is rock climbing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A determined young man is rock climbing on a rugged cliff face, focusing intently as he navigates upwards. He is wearing a snug-fitting blue t-shirt and durable climbing pants, with a safety harness secured around his waist. His muscular arms and strong grip are evident as he reaches for a higher handhold. The sunlight casts a dramatic shadow on the cliff, enhancing the rugged texture of the rock. The camera captures a close-up of his concentrated expression, moving slightly to follow his upward progress." + }, + { + "prompt_en": "A person is hula hooping", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman with long flowing hair hula hoops gracefully in a sunlit room. She wears a brightly colored tank top and fitted leggings. Her expression is one of joy and concentration as she moves the hula hoop rhythmically around her waist. The sunlight streaming through a window casts a warm glow on her and the surrounding room. The camera captures her in a steady, frontal view, highlighting her fluid movements and the vibrant colors of her outfit." + }, + { + "prompt_en": "A person is writing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A close-up view reveals a person writing in a notebook on a wooden desk. The person has slender hands and is using a sleek, black pen to carefully jot down notes. The notebook's pages are crisp white, providing a clean and organized backdrop for the writing. Soft, warm light illuminates the scene, creating a cozy and focused atmosphere. The camera subtly shifts to capture the fluid motion of the pen as it glides across the paper." 
+ }, + { + "prompt_en": "A person is bungee jumping", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person is captured mid-bungee jump, with their body gracefully arched as they plummet towards the ground. The person is wearing a safety harness and a classic bungee jumping outfit, their hair and clothing fluttering with the rush of air. The bungee cord extends upwards, emphasizing the thrilling descent. The backdrop features a dramatic view of the lush valley far below, with the sky a clear blue. The camera follows the person's downward motion, enhancing the exhilarating feeling of free fall." + }, + { + "prompt_en": "A person is pushing cart", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is pushing a shopping cart down an aisle in a brightly lit grocery store. He is athletic and handsome, sporting casual attire with a snug-fitting t-shirt and jeans. His expression is focused, and his posture is upright and confident as he moves the cart forward, with the camera capturing him from the side. The cart is empty except for one or two items, and the shelves in the background are stocked with various colorful products, creating a bustling and lively shopping atmosphere." + }, + { + "prompt_en": "A person is cleaning windows", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman with long, flowing hair is cleaning windows with a determined expression. She wears a fitted blue top and black pants, exuding poise and charm. Using a spray bottle filled with a cleaning solution, she sprays the window and then gracefully wipes it with a cloth in circular motions. Sunlight streams in through the window, creating a bright and cheerful ambiance. The camera moves slightly from left to right to capture the clarity and shine of the freshly cleaned window." + }, + { + "prompt_en": "A person is cutting watermelon", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person is cutting a watermelon on a wooden cutting board. The person is wearing a white shirt and is carefully slicing through the watermelon with a large knife. As the knife cuts through, the juicy red interior is revealed, with the light reflecting off the fresh surface. The table is in a well-lit kitchen environment, with stainless steel appliances and a bowl of colorful fruits visible in the background." + }, + { + "prompt_en": "A person is cheerleading", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is cheerleading on a grassy field, exuding energy and enthusiasm. Dressed in a vibrant cheerleading outfit with a short pleated skirt and matching top, she holds colorful pom-poms in each hand. Her long hair is styled in a high ponytail, and she wears a bright smile as she performs her routine. The woman executes a high jump with precision, her pom-poms raised high as her knees bend slightly during the leap. The afternoon sun bathes the scene in warm light, capturing her grace and athleticism. The camera follows her movement, emphasizing the height and energy of her jump." + }, + { + "prompt_en": "A person is washing hands", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person is washing hands under a shiny chrome faucet in a clean, modern bathroom. The water flows steadily over their hands, creating a gentle splashing sound. The person applies soap, creating a rich lather as they rub their hands together, ensuring thorough cleaning. 
The bright lighting reflects off the faucet and the white ceramic sink, highlighting the cleanliness and freshness of the setting. The camera captures this from a close angle, focusing on the movement of the hands and the flowing water." + }, + { + "prompt_en": "A person is ironing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A beautiful woman stands poised and attentive, ironing a crisp white shirt on a neatly arranged ironing board. She has long, flowing hair and is dressed in a soft pastel blouse, exuding elegance and charm. Her stance is graceful as she moves the iron smoothly over the shirt's fabric, creating a gentle steam that rises into the air. The room is well-lit, with warm lighting that enhances the serene and tidy atmosphere. The camera pans slightly to follow her precise movements, focusing on the meticulous care she takes with each pass of the iron." + }, + { + "prompt_en": "A person is cutting nails", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is seated comfortably at a table, gently cutting her nails with a small nail clipper. Her long, silky hair cascades over her shoulders, and she is focused on her task with a serene expression. She's wearing a soft, pastel-colored blouse. The camera captures her from a slightly elevated angle, showing her graceful hand movements as she carefully trims each nail. The lighting is soft and natural, creating a calm and intimate atmosphere. A small bottle of nail polish and a file rest on the table beside her." + }, + { + "prompt_en": "A person is hugging", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman with long, flowing hair and wearing a floral dress is tenderly hugging a young child. The child, with short curly hair and wearing a bright yellow T-shirt, wraps their small arms around the woman's neck. The woman's expression is gentle and nurturing, and she closes her eyes as she embraces the child. The background is a softly lit, peaceful room, adding warmth and coziness to the tender moment. The camera captures the scene with a slow zoom-in, highlighting the emotional connection between the two." + }, + { + "prompt_en": "A person is trimming or shaving beard", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is standing in front of a bathroom mirror, focused and carefully trimming his beard with a sleek electric trimmer. He has a well-groomed appearance, with short hair and a stylish beard. He is wearing a crisp white t-shirt, and his expression is concentrated as he ensures the beard is even and neatly shaped. The bathroom is well-lit, providing a clear and bright view of the man and his grooming routine. The camera captures a close-up of his hand movements and the precise control he uses with the trimmer." + }, + { + "prompt_en": "A person is jogging", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is jogging along a scenic forest trail, surrounded by tall, lush green trees. The sunlight filters through the leaves, casting dappled shadows on the path. He is wearing a fitted athletic shirt and shorts, with running shoes that match his outfit. His expression is focused as he moves steadily forward, and his hair bounces slightly with each step. The camera smoothly tracks him from the side, capturing the serene and refreshing atmosphere of the natural setting." + }, + { + "prompt_en": "A person is making bed", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is making a bed in a softly lit bedroom. 
She appears to be in her late 20s, with long, flowing hair and wearing a stylish, comfortable pajama set. Her movements are deliberate and graceful as she smooths the wrinkles out of a white bedspread. The room has a calming atmosphere, with sunlight streaming through a window and casting a gentle, natural light across the neatly arranged pillows and the artistically arranged bedspread. The camera captures her actions from a slightly elevated angle, providing a clear view of her expert bed-making." + }, + { + "prompt_en": "A person is washing dishes", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is standing at a kitchen sink, washing dishes with precision and care. She is wearing a cheerful yellow blouse, her sleeves rolled up to prevent them from getting wet. Her hands skillfully maneuver a soapy sponge over a plate, creating suds that glisten in the soft light above the sink. The kitchen is warm and inviting, with wooden cabinets and a vase of fresh flowers on the countertop. Water flows gently from the faucet, rinsing the soap away as the woman places the cleaned plate into a drying rack nearby. The camera smoothly pans to encircle the woman, capturing her focus and the cozy kitchen atmosphere." + }, + { + "prompt_en": "A person is grooming dog", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is grooming a fluffy, white dog on a grooming table. She is wearing a casual, light pink blouse, and her hair is neatly tied back. The dog sits patiently while she carefully brushes its coat, creating a serene and focused atmosphere. The woman's expression is gentle and caring as she tends to the dog's fur. The grooming area is well-lit, with neat rows of grooming tools visible in the background. The camera smoothly pans from left to right, capturing the woman and dog in a harmonious grooming routine." + }, + { + "prompt_en": "A person is doing laundry", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is doing laundry in a bright and tidy laundry room. The room is well-lit with natural light streaming through a window. She is standing in front of a modern washing machine, placing a pile of colorful clothes inside. The woman has long, flowing hair and is wearing a comfortable, casual top and jeans. She looks content and focused as she efficiently loads the machine. Next to her, there's a neatly arranged shelf with detergents and fabric softeners. The camera pans slightly to capture the organized and clean environment." + }, + { + "prompt_en": "A person is knitting", + "dimension": [ + "human_action" + ], + "refined_prompt": "A charming young woman is seated comfortably in a cozy room, knitting diligently. She is wearing a soft, pastel-colored sweater, which complements her serene expression. Her long hair cascades down her shoulders as she focuses intently on her crafting. The knitting needles glide rhythmically through the brightly colored yarn, forming neat rows of stitches. A soft light filters through a nearby window, creating a peaceful and warm atmosphere. The camera gently pans from her focused face to the knitting project in her hands, capturing the intricate pattern taking shape." + }, + { + "prompt_en": "A person is reading book", + "dimension": [ + "human_action" + ], + "refined_prompt": "A charming woman is sitting comfortably in a cozy room, reading a book. She has long, flowing hair and is wearing a soft, pastel-colored sweater. The warm lighting creates a calm and inviting atmosphere around her. 
As she flips a page, her expression is thoughtful and serene, indicating her deep engagement with the story. The camera captures her from a gentle side angle, highlighting her delicate features and the elegant way she holds the book." + }, + { + "prompt_en": "A person is baby waking up", + "dimension": [ + "human_action" + ], + "refined_prompt": "A charming baby girl is slowly waking up in her crib. She has soft, curly hair and bright, curious eyes. The baby is wearing an adorable pastel-colored onesie, with playful patterns on it. Her expression transitions from sleepy to an intrigued gaze as her eyes flutter open, revealing her wonder at the world around her. The warm morning sunlight gently filters through the curtains, casting a soft glow over the room and creating a cozy, inviting atmosphere. The camera captures the scene with a slight tilt, enhancing the gentle awakening and curiosity of the baby." + }, + { + "prompt_en": "A person is massaging legs", + "dimension": [ + "human_action" + ], + "refined_prompt": "A beautiful woman is seated comfortably on a plush sofa in a softly lit room, creating a warm and relaxing atmosphere. She is gently massaging her own legs, using both hands to apply soothing pressure along her calf muscles. The scene focuses on her graceful and gentle movements, capturing a sense of calm and tranquility. The camera slowly zooms in to highlight the woman's serene expression and the careful, rhythmic motion of her hands. Her attire is casual yet elegant, complementing her natural beauty." + }, + { + "prompt_en": "A person is brushing teeth", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman with long, wavy hair stands in front of a bathroom mirror, brushing her teeth. She wears a cozy, pastel-colored bathrobe and appears focused on her reflection. The toothbrush she uses is modern and electric, buzzing as she moves it in gentle circular motions across her teeth. The bathroom is softly lit, creating a warm and relaxing ambiance. The woman’s reflection in the mirror adds depth to the scene, and the sound of running water can be faintly heard in the background. The camera smoothly pans over her shoulder to capture her attentive expression in the mirror." + }, + { + "prompt_en": "A person is crawling baby", + "dimension": [ + "human_action" + ], + "refined_prompt": "A charming baby, with soft, curly hair and dressed in a cute pastel-colored onesie, is crawling energetically on a soft, plush beige carpet. The baby moves towards the camera with bright, curious eyes and a joyful expression, with tiny hands and knees pressing into the carpet. The scene is warmly lit, creating a cozy and inviting atmosphere. A gentle camera movement follows the baby's progression, capturing the delightful moment as the baby giggles along the way." + }, + { + "prompt_en": "A person is motorcycling", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is motorcycling down a scenic country road on a bright, sunny day. He is wearing a sleek black leather jacket and a matching helmet, adding to his stylish appearance. The motorcycle glistens in the sunlight as he expertly maneuvers along the winding road. Trees with lush green leaves line the sides of the road, and their shadows occasionally play across the path. The camera follows closely behind, capturing the man's confident posture and the smooth motion of the ride." 
+ }, + { + "prompt_en": "A person is driving car", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is driving a sleek car on a scenic road surrounded by lush green trees. With a focused expression, he grips the steering wheel, wearing a stylish black leather jacket that complements the car's modern interior. The sunlight filters through the leaves, creating a dappled pattern on the windshield. The camera captures a side angle of the man through the open window as the car smoothly glides along the winding road, with the gentle breeze ruffling his hair." + }, + { + "prompt_en": "A person is sticking tongue out", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman with a playful expression is sticking her tongue out in a lighthearted manner. She has long, wavy hair cascading over her shoulders and is wearing a casual, colorful top. Her bright eyes and slightly raised eyebrows add to her cheerful demeanor. The background is softly blurred, emphasizing her face and playful gesture. The camera captures her from the shoulders up, with a gentle zoom-in to highlight her expression." + }, + { + "prompt_en": "A person is shaking head", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman with long, flowing hair is shaking her head gently from side to side. Her beautiful locks cascade around her shoulders as she moves. She wears a stylish blouse with intricate patterns and has a serene expression on her face. The lighting is soft and warm, illuminating her graceful features and adding a gentle glow to her hair. The camera remains steady, capturing the fluid motion of her hair and the calm atmosphere around her." + }, + { + "prompt_en": "A person is sword fighting", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is engaging in a dynamic sword fight, skillfully wielding a sleek sword. He is wearing a dark, fitted outfit that accentuates his athletic build. With intense focus, he executes a series of swift and precise moves, parrying and attacking with fluidity. His expression is concentrated and determined as he maneuvers, moving gracefully across a spacious training area. The lighting highlights his movements, creating dramatic shadows that enhance the intensity of the action. The camera smoothly follows his motions, capturing the energy of the sword fight from various angles." + }, + { + "prompt_en": "A person is doing aerobics", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is doing aerobics in a brightly lit room with large windows. The woman, with long, flowing hair tied in a ponytail, is wearing a fitted sports tank top and leggings. She energetically moves to the rhythm, raising her arms overhead and stepping side to side. Her expression is focused and lively, capturing the joy of movement. As she performs her routine, the camera smoothly pans around her, showcasing the fluidity and grace of her aerobic workout. The sunlight filters in through the windows, creating a vibrant and energetic atmosphere." + }, + { + "prompt_en": "A person is strumming guitar", + "dimension": [ + "human_action" + ], + "refined_prompt": "A charming woman is sitting comfortably on a chair, strumming a guitar with a gentle smile on her face. Her long, wavy hair cascades over her shoulders, and she is dressed in a casual, yet stylish outfit, wearing a flowy blouse and jeans. The guitar, with its polished wooden body, rests on her lap as she gracefully moves her fingers along the strings. 
Her posture is relaxed, and her expression exudes joy and concentration. The warm lighting in the room creates a cozy and inviting atmosphere, with soft shadows accentuating her graceful movements. The camera slowly zooms in to capture the details of her strumming technique." + }, + { + "prompt_en": "A person is riding or walking with horse", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person is gently leading a horse through a lush green meadow. The person, a handsome man, is dressed in a casual plaid shirt and jeans, exuding a relaxed yet confident demeanor. The horse, with a glossy chestnut coat, walks alongside him, its mane flowing with each step. The man holds the horse's reins loosely, guiding it with ease while both take measured, synchronized steps. The tall grass sways gracefully with a light breeze, and the sun casts a warm, golden glow over the serene landscape. The camera follows the pair smoothly from a side angle, capturing their peaceful interaction and the natural beauty of their surroundings." + }, + { + "prompt_en": "A person is archery", + "dimension": [ + "human_action" + ], + "refined_prompt": "A handsome young man is practicing archery outdoors, standing with focused intent. He is wearing a fitted, short-sleeved shirt and jeans, with a confident stance. Holding a sleek bow in his left hand, he draws the arrow back with his right, aiming towards an unseen target. The scene is set against a backdrop of lush greenery in a softly sunlit environment. The camera gently pans from right to left, capturing his determined expression and the graceful tension in his posture as he prepares to release the arrow." + }, + { + "prompt_en": "A person is catching or throwing baseball", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young, athletic man is standing on a grassy baseball field under a bright, sunny sky. He wears a baseball cap, a short-sleeved jersey, and comfortable athletic pants. With focused determination, he skillfully throws a baseball towards the camera, his arm extended fully in follow-through. The ball leaves his hand with speed and precision. His stance is solid, knees slightly bent, and his expression is one of concentration. The scene captures the essence of a warm, sunny day at the ballpark, with distant sounds of cheering and a light breeze rustling the nearby trees. The camera zooms in on the man's confident expression as he throws, then follows the trajectory of the baseball." + }, + { + "prompt_en": "A person is playing chess", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman with long, flowing hair is seated at a table, deeply focused on a chessboard in front of her. She is wearing a stylish blouse and has an elegant posture, her fingers delicately poised over a chess piece. The board is set up with an array of pieces, and she is in the middle of contemplating her next move. Her expression is one of concentration and determination. Soft, ambient lighting creates a warm and inviting atmosphere, illuminating the chessboard and highlighting the contours of her face. The camera gradually zooms in on her thoughtful expression as she makes a strategic move on the board." + }, + { + "prompt_en": "A person is rock scissors paper", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young, handsome man is engaging in a game of rock-paper-scissors. He stands with a playful expression, and his hand is held out in front of him, gesturing in the act of playing. 
The man has short, neatly styled hair and is wearing a casual T-shirt. As the camera zooms in, his hand alternates between the different shapes—rock, paper, and scissors—demonstrating each choice in the game. The lighting is bright and cheerful, creating an upbeat atmosphere." + }, + { + "prompt_en": "A person is using computer", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is sitting at a desk, deeply focused on a computer screen. She has long, flowing hair and wears a stylish blouse with subtle patterns. Her eyes are fixed on the screen as she gently types on the keyboard. The room is warmly lit, casting a soft glow on her face and her surroundings. The desk is organized, with a few office supplies neatly placed around the computer, and a comfortable chair supports her posture. The camera captures a side view of her focused expression and the illuminated screen." + }, + { + "prompt_en": "A person is arranging flowers", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is carefully arranging a bouquet of vibrant flowers on a wooden table. She is gracefully holding a sunflower in one hand, her fingers delicately adjusting its position among roses and daisies. She has flowing, shiny hair and is wearing a light, floral-patterned dress that complements the colors of the flowers. The sun casts a warm glow on her, highlighting her serene and focused expression. The camera subtly pans to capture her skilled hands and the beauty of the completed arrangement, creating a peaceful and artistic atmosphere." + }, + { + "prompt_en": "A person is bending metal", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is bending a piece of metal using a metal bending tool in a workshop. He has a muscular build, and his face is focused and determined as he handles the equipment. The man is wearing a dark, fitted T-shirt that highlights his strong arms, along with safety goggles for protection. The lighting in the workshop is bright, emphasizing the reflections on the metal and the intensity of his task. The camera zooms in slightly as he applies force, showcasing the smooth curve forming in the metal piece." + }, + { + "prompt_en": "A person is ice skating", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is gracefully ice skating on a smooth ice rink. She is dressed warmly in a stylish, vibrant blue coat and matching gloves, with her hair neatly tucked under a cozy knitted hat. Her movements are fluid and elegant as she glides across the ice, performing gentle turns and swirls. The cold air creates a misty atmosphere, and the rink is softly illuminated by overhead lights, enhancing the enchanting scene. The camera follows her from a side view, capturing her serene expression and the rhythmic motion of her skating." + }, + { + "prompt_en": "A person is climbing a rope", + "dimension": [ + "human_action" + ], + "refined_prompt": "A strong, athletic man is climbing a thick rope hanging in a gymnasium. His well-defined muscles flex as he grips the rope tightly and pulls himself upwards. He is wearing a sleeveless workout top and shorts, with determination evident in his focused expression. The gym is brightly lit, and the background shows gym equipment such as weights and exercise machines. The man continues to ascend steadily, his feet occasionally bracing against the rope for support. The camera pans upwards to follow his progress, capturing the intensity of his climb." 
+ }, + { + "prompt_en": "A person is crying", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is sitting on a simple wooden chair, crying softly. Her head is slightly bowed, and tears gently roll down her cheeks. She has long, dark hair that frames her delicate face and partially obscures her downcast eyes. She wears a light, flowing dress with soft pastel colors, adding to her graceful and charming appearance. The room is softly lit, creating a warm and intimate atmosphere. The camera slowly zooms in to capture the subtle expressions of emotion on her face." + }, + { + "prompt_en": "A person is dancing ballet", + "dimension": [ + "human_action" + ], + "refined_prompt": "A graceful ballerina is performing ballet on a polished wooden stage. She is wearing a delicate, flowing white tutu and elegant pointe shoes that accentuate her poised movements. Her hair is styled in a sleek bun, complementing her ethereal appearance. As she dances, she executes a series of pirouettes and arabesques, moving lightly across the stage. Her arms and hands flow gracefully, adding to the fluidity of her performance. The stage lighting focuses on her, casting a gentle spotlight that enhances her enchanting presence as she dances towards the camera. The camera slowly follows her movements, capturing the elegance and beauty of each step." + }, + { + "prompt_en": "A person is getting a haircut", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is seated in a modern, stylish barber chair, getting a haircut. He is young, with neatly styled dark hair and a well-groomed beard. The barber, a well-dressed professional with precise movements, is focused on trimming the man's hair with sharp scissors. The camera captures this scene from the side, showing the dexterous hands of the barber. In the background, a large mirror reflects the action, along with a row of neatly organized hair care products and tools. The atmosphere is calm and professional, with soft lighting adding a warm ambiance to the setting." + }, + { + "prompt_en": "A person is running on treadmill", + "dimension": [ + "human_action" + ], + "refined_prompt": "A fit and athletic man is running on a treadmill indoors. He is wearing a light gray tank top and black athletic shorts, and his hair is neatly styled. His expression shows determination and focus as he maintains a steady pace. The treadmill's digital display shows his speed and time. The room is brightly lit, highlighting his toned physique, and the background features minimalist decor with a large window letting in natural light. The camera is positioned at a slight angle, capturing both the side profile of the man and the movement of his legs as he runs." + }, + { + "prompt_en": "A person is kissing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman with long flowing hair and a serene expression is gently kissing a fluffy, adorable golden retriever puppy. She is wearing a casual, light sweater and has a gentle smile on her lips. The puppy, with its soft fur and big, innocent eyes, looks content and calm in her embrace. The background is softly blurred, allowing the focus to remain on the tender interaction between the woman and the puppy. The camera gently zooms in, capturing the warmth and affection shared between them." + }, + { + "prompt_en": "A person is counting money", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman with long, flowing hair is sitting at a desk, counting a stack of money. 
She is elegantly dressed in a fitted blouse. Her expression is focused and calm as she carefully flips through the bills with her manicured fingers. The desk is made of polished wood, and soft, natural light filters in from a nearby window, creating a warm and inviting atmosphere. The camera gently zooms in to capture the details of her focused demeanor and the neatly arranged bills." + }, + { + "prompt_en": "A person is barbequing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A man is barbequing in a backyard setting during the early evening. He is standing close to a large, modern grill, which is radiating soft, warm light from the coals inside. The man is handsome, with short, well-groomed hair, and he is wearing a casual short-sleeved shirt and jeans. Holding a pair of tongs in his hand, he attentively turns over juicy steaks that sizzle on the grill. The camera captures a clear, steady view from the side, accentuating the mouthwatering aroma rising through the air, as the glow from the grill reflects off his face." + }, + { + "prompt_en": "A person is peeling apples", + "dimension": [ + "human_action" + ], + "refined_prompt": "A woman is sitting at a kitchen table, focused on peeling apples with precision. She gently holds a bright red apple in one hand and skillfully maneuvers a small paring knife in the other, creating long, curling peels that fall gracefully onto the table. She is wearing a cozy knitted sweater, and the soft kitchen lighting casts a warm glow on her surroundings. A bowl filled with freshly peeled apples is placed nearby, ready for use, while a few unpeeled apples await their turn. The camera slowly zooms in to capture the woman's concentrated expression and the delicate motion of her hands." + }, + { + "prompt_en": "A person is milking cow", + "dimension": [ + "human_action" + ], + "refined_prompt": "A middle-aged man is milking a cow in a rustic barn setting. He has short, dark hair and a friendly expression as he focuses on his task. He wears a plaid shirt and jeans, sitting on a low stool beside the cow. The cow is calm, with its tail occasionally flicking. The man gently squeezes the udder, and the milk flows steadily into a metal bucket placed on the straw-covered floor. Soft light filters through the wooden slats of the barn, creating a serene and warm atmosphere. The camera captures the scene with a slow, steady pan, highlighting the man's attentive and experienced movements." + }, + { + "prompt_en": "A person is shining shoes", + "dimension": [ + "human_action" + ], + "refined_prompt": "A person, a beautiful young woman with long, flowing hair, is meticulously polishing a pair of black leather shoes. She is focused intently on her task with a gentle smile on her face. She is wearing a casual white shirt with rolled-up sleeves and blue jeans. The shoes are placed on a wooden surface, and she uses a soft cloth to apply the polish, creating a smooth and glossy finish. The setting is warmly lit, emphasizing the care and attention she puts into her work. The camera slowly zooms in, capturing the detailed movements of her hands and the sheen of the polished shoes." + }, + { + "prompt_en": "A person is making snowman", + "dimension": [ + "human_action" + ], + "refined_prompt": "A young woman is building a snowman in a snow-covered park, surrounded by tall, frosted trees. She has a charming smile, her long, dark hair softly catching the winter sunlight. 
Dressed in a cozy red coat and a knitted white scarf, she carefully places a carrot as the snowman's nose. The snowman, already sporting a hat and scarf, stands in front of her. The camera gently circles around, capturing her joyful expression and the serene, wintry atmosphere." + }, + { + "prompt_en": "A person is sailing", + "dimension": [ + "human_action" + ], + "refined_prompt": "A handsome man is skillfully sailing a small sailboat on a tranquil lake. The sun casts a bright and warm glow over the water, creating a serene and picturesque scene. He is wearing a casual white shirt and khaki shorts, and his short hair is slightly tousled by the gentle breeze. With focused determination, he adjusts the sail, his eyes fixed on the horizon as the boat glides smoothly across the shimmering surface of the lake. The camera pans smoothly alongside the boat, capturing the elegance of the sailboat set against the peaceful expanse of water." + }, + { + "prompt_en": "a person swimming in ocean", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A woman is gracefully swimming in the ocean, her long hair flowing behind her as she moves through the water. The sunlight sparkles on the ocean surface, creating a shimmering effect. She wears a bright-colored swimsuit that stands out against the blue of the water. Her movements are smooth and fluid, with her arms cutting through the waves as she swims towards the horizon. The camera follows her from a low angle, capturing the gentle rise and fall of the ocean around her, enhancing the serene and tranquil atmosphere of the scene." + }, + { + "prompt_en": "a person giving a presentation to a room full of colleagues", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A confident man, appearing to be in his early thirties, stands at the front of a modern conference room, delivering a presentation to a group of attentive colleagues seated around a large table. He is wearing a crisp white dress shirt and tailored gray trousers, exuding professionalism and poise. His hair is neatly styled, and he gestures smoothly with his hands to emphasize his points, creating a dynamic and engaging scene. The colleagues, consisting of both men and women, are focused on him, some taking notes. The room is well-lit, with a large digital screen behind him displaying key points of his presentation. The camera pans slowly across the room, capturing the interaction and the attentive atmosphere." + }, + { + "prompt_en": "a person washing the dishes", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A charming woman is washing dishes at a kitchen sink. She is in her late twenties, with long, wavy brown hair tied back, and wearing a light blue blouse with rolled-up sleeves. Her expression is content and relaxed as she focuses on her task. The kitchen is bathed in warm, natural light coming through a nearby window, adding a cozy atmosphere to the scene. The woman stands slightly angled towards the camera, with her hands immersed in soapy water, gently scrubbing a plate. The camera slowly zooms in to capture the details of her actions, highlighting the splashes of water and bubbles as she washes." 
+ }, + { + "prompt_en": "a person eating a burger", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A man is seated at an outdoor café, casually enjoying a burger. He is in his mid-30s with a neatly trimmed beard and a friendly expression, wearing a stylish navy blue jacket over a casual t-shirt. As he takes a bite, the camera captures the details of the juicy burger, with lettuce and tomato peeking out from the bun. Sunlight filters through the trees around the café, creating a pleasant, relaxed atmosphere. The camera angles slightly upward as it focuses on the man's content expression, emphasizing his enjoyment of the meal." + }, + { + "prompt_en": "a person walking in the snowstorm", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A person, bundled in a heavy winter coat with a hood, is trudging through a snowstorm. Snowflakes swirl around them, blurring the background of snow-covered trees. They move slowly, facing towards the camera, with their head slightly bowed against the biting wind. The atmosphere is cold and wintry, with a sense of perseverance as the person navigates through the thick flurries. The camera provides a steady, close view, emphasizing the intensity of the snowstorm and the determination of the person." + }, + { + "prompt_en": "a person drinking coffee in a cafe", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A woman sits in a cozy cafe near a large window, sipping coffee from a ceramic mug as she gazes outside. She has beautiful features, with long hair cascading over her shoulders. Her outfit is charming, with a light sweater adding to the warmth of the scene. The cafe is softly lit, with ambient light streaming through the window, creating a welcoming atmosphere. As she enjoys her drink, a gentle smile graces her lips, capturing a moment of peaceful reflection. The camera slowly pans from the window to focus on her serene expression as she takes another sip." + }, + { + "prompt_en": "a person playing guitar", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A charismatic young man commands the stage, a whirlwind of motion and sound. He energetically plays an electric guitar, not just standing still, but dancing across the stage with explosive energy that perfectly matches the powerful riffs he unleashes. Bathed in the glow of vibrant, pulsing stage lights, he is the center of attention. He has a captivating presence with stylishly tousled hair and wears a cool leather jacket over a band t-shirt and ripped jeans, embodying a rockstar aesthetic. His fingers dance skillfully and rapidly across the fretboard, producing a soaring, high-energy anthem that electrifies the crowd. The camera work is dynamic, utilizing sweeping wide shots to capture his stage presence and quick cuts to intense close-ups of his passionate expression and the intricate, lightning-fast movements of his fingers on the guitar strings." + }, + { + "prompt_en": "a bicycle leaning against a tree", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek and modern bicycle with a shining metallic frame is leaning against a sturdy tree trunk. The tree stands tall with rich green leaves, casting dappled shadows on the ground around the bicycle. 
The scene is serene, with soft sunlight filtering through the tree canopy, creating a tranquil and picturesque atmosphere. The camera slowly zooms in on the bicycle, highlighting its polished gears and smooth tires, before panning up to showcase the tree's textured bark and lush leaves." + }, + { + "prompt_en": "a bicycle gliding through a snowy field", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "In a snowy field, a bicycle is seen gliding smoothly across the snow-covered terrain. The bicycle leaves a distinct trail in the snow as it moves, with its wheels creating gentle grooves along the path. The field is blanketed in pristine white snow, casting a serene and quiet atmosphere. The camera follows the bicycle from a low angle, capturing the steady and graceful movement forward across the vast snowy expanse, with distant trees faintly visible in the background." + }, + { + "prompt_en": "a bicycle slowing down to stop", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek bicycle with shiny metallic frames approaches the camera, gradually slowing down. The rider is a young man wearing a fitted black T-shirt and dark jeans. His expression is focused as he gently squeezes the brakes, causing the bicycle's wheels to gradually come to a halt on the pavement. The setting sun casts a golden hue over the scene, highlighting the bicycle's details. The camera follows the bicycle's movement, smoothly panning until the bicycle comes to a complete stop." + }, + { + "prompt_en": "a bicycle accelerating to gain speed", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek, modern bicycle is seen from the side, with its shiny metal frame catching the light. The cyclist, a fit man in a streamlined cycling outfit, is hunched over the handlebars, intensely focused on his acceleration. The background features a blurred landscape of trees and sky, emphasizing the speed. The wheels of the bicycle spin rapidly as the man pedals furiously to gain speed, and the wind whips past him, ruffling his clothing and causing the leaves on the trees to sway. The camera smoothly tracks the cyclist, maintaining a side view as he accelerates down a straight path." + }, + { + "prompt_en": "a car stuck in traffic during rush hour", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek, silver car is stuck in heavy traffic during rush hour, surrounded by a sea of other vehicles. The car's shiny exterior glistens under the bright city lights, reflecting the urban hustle and bustle. The vehicles around it are at a standstill, with red brake lights illuminating the scene, creating a vibrant, chaotic atmosphere. The camera gradually zooms in on the car, capturing the driver's slightly frustrated expression as they grip the steering wheel, glancing at the static line of cars ahead. This close-up view emphasizes the impatience and restless energy typical of rush hour traffic." + }, + { + "prompt_en": "a car turning a corner", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek, silver car gracefully turns a sharp corner on a narrow, winding road. As it maneuvers the curve, its tires grip the asphalt firmly, creating a smooth yet dynamic motion. 
The car's shiny exterior reflects the surrounding lush greenery, adding a vibrant contrast to the scene. The camera follows the car's movement from behind, capturing the precision and agility of the vehicle as it completes the turn." + }, + { + "prompt_en": "a car slowing down to stop", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek, silver car is driving along a tree-lined road, gradually slowing down to a complete stop. The car's headlights reflect off the road as it comes to a halt, with the lush green trees providing a serene backdrop. The camera moves alongside the car, capturing its smooth deceleration as it approaches the camera and stops. The sunlight filters through the trees, casting dappled patterns on the road. The gentle rolling of the car emphasizes the peaceful, calm atmosphere of the scene." + }, + { + "prompt_en": "a car accelerating to gain speed", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek sports car is positioned on an open road, ready to accelerate. As the vehicle begins to move, the camera captures it from a side-angle, highlighting its aerodynamic design under the bright sunlight. The car's tires spin rapidly, kicking up dust as the car gains speed. The camera smoothly pans to follow the car as it accelerates down the empty road, creating a sense of excitement and power. The landscape around them blurs slightly, emphasizing the car's increasing velocity." + }, + { + "prompt_en": "a motorcycle cruising along a coastal highway", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek motorcycle is cruising smoothly along a winding coastal highway, flanked by cliffs on one side and the vast ocean on the other. The motorcycle's gleaming chrome and its rider's black leather jacket catch the sunlight, creating a dynamic and adventurous atmosphere. The rider maintains a relaxed posture as the motorcycle effortlessly navigates the curves of the road. The ocean sparkles under the bright sun, and the sound of the waves can almost be felt as the camera follows the motorcycle's journey along the scenic route." + }, + { + "prompt_en": "a motorcycle turning a corner", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek motorcycle, painted in a shiny red and black combination, is seen gracefully turning a corner on a paved road. The rider, wearing a matching helmet and a black leather jacket, leans into the turn, skillfully maintaining balance and control. The motorcycle tilts slightly, and the rider's focused expression is visible. Sunlight glints off the motorcycle's surface, adding a sense of speed and energy. The camera follows the motorcycle smoothly, capturing the dynamic movement and the sense of adrenaline as it rounds the bend." + }, + { + "prompt_en": "a motorcycle slowing down to stop", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek, black motorcycle with shiny chrome accents approaches the camera from a distance along a scenic coastal road. The rider, a man in his mid-thirties wearing a fitted leather jacket, dark jeans, and a black helmet, gradually decelerates. As the motorcycle gets closer, its powerful engine's growl becomes more pronounced. 
The man slows the motorcycle to a stop, smoothly bringing it to a halt with the backdrop of a tranquil ocean visible in the background. The golden hour lighting casts a warm hue, accentuating the motorcycle's polished metal surfaces. The camera follows the man as he stops, capturing the serene moment." + }, + { + "prompt_en": "a motorcycle gliding through a snowy field", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A motorcycle is gliding effortlessly through a snowy field, leaving a trail in the crisp white snow. The rider, wearing a dark, insulated suit and a helmet with a reflective visor, maintains a steady posture as the bike moves smoothly forward. The snow sparkles under the bright midday sun, adding a sense of brilliance to the scene. The camera follows closely behind, capturing the motion and the spray of snow kicked up by the tires, with the vast, snowy landscape stretching out towards a distant line of trees in the background." + }, + { + "prompt_en": "a motorcycle accelerating to gain speed", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek motorcycle is seen stationary on a flat, open road. As the video begins, the motorcycle starts to accelerate rapidly, its engine producing a powerful roar. The rider leans forward slightly, dressed in a black leather jacket and helmet, gripping the handlebars firmly. The wheels spin faster, and the motorcycle begins to move swiftly away from the camera. Dust and small pebbles kick up behind the rear wheel, and the surrounding landscape blurs with the increasing speed, adding to the sense of momentum and dynamism in the scene." + }, + { + "prompt_en": "an airplane soaring through a clear blue sky", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "An airplane is soaring gracefully through a clear blue sky, leaving behind a streak of jet contrails. The sunlight glistens off the airplane's metallic surface, highlighting its sleek design. As the camera follows the aircraft's path, it captures the serene expanse of the sky, emphasizing the airplane's steady and smooth movement across the vast openness. The winglets are subtly silhouetted against the brightness of the sun, adding to the aerial scene's elegance and tranquility." + }, + { + "prompt_en": "an airplane taking off", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "An airplane is taking off from a runway, the aircraft's sleek body gaining speed as it moves toward the camera. The sun is setting in the background, casting an orange glow across the sky and the runway. As the airplane lifts off the ground, its wings tilt slightly upward, and the landing gear begins to retract. The camera follows the plane's ascent, capturing the runway and surrounding landscape fading into the distance. The atmosphere is charged with the anticipation of flight and the power of the engines." + }, + { + "prompt_en": "an airplane landing smoothly on a runway", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "An airplane gracefully descends towards the runway, with its landing lights brightly illuminating its path. The sky is clear and blue, with a hint of the setting sun providing a warm backdrop. The airplane's sleek body and extended landing gear are clearly visible as it approaches the tarmac. 
The camera smoothly follows the aircraft's descent, capturing the moment when its wheels make gentle contact with the runway. A slight dust cloud forms as it lands, and the airplane continues to roll forward, gradually slowing down. The scene conveys a sense of precision and calmness." + }, + { + "prompt_en": "an airplane accelerating to gain speed", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "An airplane is seen from a side view as it accelerates along an airport runway, ready to gain speed for takeoff. The plane's sleek metallic body catches the sunlight, creating a reflective effect. The engines roar with power, emitting visible streams of exhaust as the wheels roll swiftly on the tarmac. The camera smoothly tracks the airplane's motion, highlighting its impressive acceleration and the dynamic nature of the scene. The background features other airport facilities and a clear blue sky." + }, + { + "prompt_en": "a bus turning a corner", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A large city bus in bright colors is seen turning a corner smoothly on a bustling street. The bus's tires navigate the curve with precision, and its windows reflect the surrounding buildings and streetlights. The scene is set in the late afternoon, with soft, ambient lighting casting shadows on the road. As the bus turns, the camera slightly pans to follow its movement, capturing the fluid arc of its journey around the corner. Pedestrians can be seen on the sidewalk, adding to the city's lively atmosphere." + }, + { + "prompt_en": "a bus stuck in traffic during rush hour", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A bus is stuck in heavy traffic during rush hour, surrounded by a dense line of vehicles on all sides. The bus is painted in bright colors and stands out among the other cars. Inside, passengers can be seen through the windows, some appearing restless and others absorbed in their phones or books. The city skyline is faintly visible in the background, with buildings reflecting the golden hues of the setting sun. The scene is lively with the hustle and bustle typical of a busy urban environment, while horns occasionally blare, adding to the bustling atmosphere." + }, + { + "prompt_en": "a bus accelerating to gain speed", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek, modern bus begins to accelerate on a wide open road, moving away from the camera. Its shiny exterior catches the light, reflecting the bright sky above. The bus smoothly picks up speed, its wheels spinning faster, as it leaves the city skyline in the background. The camera pans to follow the bus, maintaining focus on its rapid movement and powerful forward motion. Dust kicks up slightly from the road, creating a sense of intensity and energy." + }, + { + "prompt_en": "a train speeding down the tracks", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek, modern train is speeding down the tracks with great velocity, its streamlined design enhanced by the sunlight glinting off its metallic surface. The train moves swiftly from the left side of the frame towards the right, with the tracks extending into the distance. The landscape around the tracks is blurred due to the train's speed, emphasizing its rapid motion. 
The camera follows the train smoothly as it continues down the line, creating an exhilarating sense of movement." + }, + { + "prompt_en": "a train crossing over a tall bridge", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek, modern train speeds across a tall bridge, set against a clear blue sky. The bridge's impressive height is emphasized by its strong, arched supports visible beneath it. As the train races from the left to the right side of the frame, its rapid movement creates a subtle motion blur. Its metallic surface gleams in the sunlight, accentuating its streamlined design. Below, a scenic landscape unfolds with lush greenery, while far in the distance, gentle hills enhance the majestic setting. The camera follows the train at a slight upward angle, capturing the grandeur of the bridge and the swift motion of the train in a sweeping, panoramic view." + }, + { + "prompt_en": "a train accelerating to gain speed", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek modern train begins to accelerate along a set of steel tracks surrounded by verdant green trees. As the train picks up speed, its aerodynamic design makes it appear to slice swiftly through the air. The camera pans smoothly alongside the train, showcasing its shiny exterior and highlighting the contrast of its metallic body against the natural background. The train's powerful motion is emphasized by the blur of the trees as it races past. The setting sun in the sky casts a warm, golden glow, enhancing the sense of dynamic speed and energy." + }, + { + "prompt_en": "a truck turning a corner", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A large red truck is seen turning a corner on a paved road. The truck's headlights are on, illuminating its path as it navigates the turn. The camera follows the truck's front side, capturing its wheels as they smoothly maneuver around the corner. The backdrop includes a few trees and a clear blue sky, creating a vibrant and energetic atmosphere. The truck's motion is fluid, and its engine emits a low, steady rumble as it completes the turn." + }, + { + "prompt_en": "a truck anchored in a tranquil bay", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A truck is parked on a small sandy hill overlooking a tranquil bay. In the background, the calm water of the bay reflects the blue sky and surrounding greenery. The truck, painted in a vivid shade of blue, is positioned at an angle, with its front facing slightly towards the bay. The scene is set in the late afternoon, with soft sunlight creating gentle shadows around the truck. Trees and hills can be seen in the distance, adding to the serene and peaceful atmosphere of the location." + }, + { + "prompt_en": "a truck stuck in traffic during rush hour", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A large blue truck is caught in a dense traffic jam during rush hour, with vehicles surrounding it on all sides. The camera captures the scene from a slightly elevated angle, emphasizing the tightly packed rows of cars and buses. The truck is stationary, with its headlights on, reflecting off the bumper of the car in front. 
A mix of brake lights and indicator signals creates a glow of red and orange hues in the dimming light, conveying the bustling yet frustrating atmosphere of rush hour. The camera pans slowly to the left, revealing more of the congested road filled with vehicles." + }, + { + "prompt_en": "a truck slowing down to stop", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A large, blue truck is slowing down as it approaches a stop. The truck's painted metallic surface gleams under the bright sunlight. Its tires leave faint tracks on the road as they rotate gradually slower. The camera follows the truck from the side, capturing the motion and energy of the vehicle as it decelerates. The truck's exhaust releases a gentle puff of smoke that disperses quickly into the air. In the background, distant trees and a clear sky frame the scene." + }, + { + "prompt_en": "a truck accelerating to gain speed", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A robust, red truck starts from a standstill and begins accelerating down a straight highway. The camera pans behind the truck, capturing the powerful motion as it quickly gains speed. The engine roars as the truck's tires spin and grip the asphalt, while the world around appears to blur with the increasing speed. Dust and small debris are kicked up by the tires, creating a dynamic scene that emphasizes the truck's power and acceleration. The lighting is bright and clear, highlighting the truck's sleek and shiny exterior as it moves further away from the camera." + }, + { + "prompt_en": "a boat sailing smoothly on a calm lake", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A small wooden boat with a single sail glides smoothly across the surface of a serene lake. The water is exceptionally calm, reflecting the soft hues of a clear blue sky. The boat is painted in soft, earthy tones, and its sail gently billows in the light breeze. As the boat moves steadily, the peacefulness of the scene is complemented by the ripples forming behind it. The camera pans slowly from left to right, capturing the entire scene, including the lush greenery along the shoreline in the distance." + }, + { + "prompt_en": "a boat slowing down to stop", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A small motorboat is gliding across a calm lake, its engine causing gentle ripples on the water's surface. As the boat approaches the camera, its speed gradually decreases, and the water begins to settle around it. The boat is white with a dark blue stripe and has a single occupant, a man in his late 30s with windswept hair, wearing a light gray jacket and sunglasses. He turns slightly to the side, looking ahead as his hands adjust the steering. The sky above is clear, with soft sunlight reflecting off the water, creating a serene and tranquil atmosphere. The camera pans slightly to follow the boat's movement, capturing its graceful deceleration until it comes to a complete stop." + }, + { + "prompt_en": "a boat accelerating to gain speed", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sleek motorboat is seen accelerating on a vast, open lake, with the water glistening under bright sunlight. 
The boat's powerful engine roars as it slices through the water, leaving a frothy wake behind. The boat picks up speed, its bow lifting slightly as it speeds away from the camera, towards the distant horizon. The surrounding water is a brilliant blue, and small waves ripple outward from the boat, adding to the impression of speed and energy. The camera pans to follow the boat's swift movement, capturing the dynamic motion and sparkling water." + }, + { + "prompt_en": "a bird soaring gracefully in the sky", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A majestic bird soars gracefully across the clear blue sky, its wings fully extended and gliding effortlessly on the wind. The sunlight highlights the bird's outstretched wings and sleek body, creating a magnificent display of nature's elegance. The camera follows the bird's flight from below, capturing the serene movement as it navigates through the open air. The soft sunlight enhances the tranquil atmosphere, emphasizing the bird's majestic presence against the vast expanse of the sky." + }, + { + "prompt_en": "a bird building a nest from twigs and leaves", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A small, colorful bird is perched on a branch, carefully placing a twig into a nest it is building. The bird, with vibrant plumage, flutters its wings slightly as it adjusts the twig, ensuring it fits snugly into the growing structure. The nest, made of intertwined twigs and leaves, is nestled securely among the branches and green foliage, creating a serene natural setting. The bird's eyes are focused and attentive, suggesting its dedication to crafting the perfect home. The camera slowly zooms in to capture the intricate details of the nest and the bird's graceful movement." + }, + { + "prompt_en": "a bird flying over a snowy forest", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A majestic bird glides gracefully over a vast, snowy forest. The sky is a crisp, pale blue, contrasting beautifully with the blanket of snow below. As the bird soars forward, its wings spread wide, casting a shadow on the snow-covered treetops. The sun's rays glisten on the snow, creating a sparkling effect that adds to the serene and pristine atmosphere. The camera follows the bird from behind, capturing the expansive view of the forest and the bird's elegant flight path." + }, + { + "prompt_en": "a cat grooming itself meticulously with its tongue", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A cat with soft, fluffy fur and striking green eyes is seated on a plush rug. In the soft lighting, the cat gracefully grooms itself, its pink tongue meticulously cleaning its front paw. Its eyes are half-closed, conveying a sense of contentment and focus. The camera slowly zooms in, capturing the gentle and rhythmic motion of the cat's tongue as it moves across its fur. The atmosphere is calm and soothing, with the cat's fur catching the light, highlighting its pristine appearance." + }, + { + "prompt_en": "a cat playing in park", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A playful cat is frolicking in a lush, green park. The sun is shining brightly, creating a cheerful atmosphere. 
The cat, with a sleek coat of fur and bright eyes, pounces around on the well-manicured grass. It bats at a small leaf fluttering in the breeze, its tail flicking excitedly. In the background, tall trees sway gently, and vibrant flowers add bursts of color to the scene. The camera follows the cat's energetic movements, capturing the sense of joy and freedom in its playful antics." + }, + { + "prompt_en": "a cat drinking water", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A charming cat, with soft, fluffy fur in a mix of white and gray, stands by a gleaming stainless steel bowl filled with water. The setting is warmly lit, creating a cozy ambiance. The cat lowers its head delicately to lap up the water, its pink tongue flicking in and out as tiny ripples form on the water's surface. Its eyes are serene and focused on the bowl. The camera gently zooms in, capturing the intimate moment where the cat's whiskers slightly brush against the water's edge." + }, + { + "prompt_en": "a cat running happily", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A fluffy cat with a gleaming coat is running playfully in an open grassy field. Its tail is raised high, and its eyes are wide with excitement. The cat moves towards the camera with graceful agility, leaping over patches of flowers as it runs. Sunlight bathes the scene, casting gentle shadows across the grass. As the camera follows the cat's joyful sprint, the playful and carefree essence of the moment is captured beautifully." + }, + { + "prompt_en": "a dog enjoying a peaceful walk", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A fluffy golden retriever is enjoying a leisurely walk along a sunlit forest path. The dog trots contentedly, with its tail wagging gently and ears perked up. Sunlight filters through the trees, casting dappled patterns on the ground. The golden retriever occasionally sniffs the surrounding foliage, clearly relishing the tranquil environment. The scene captures the serene ambiance of a peaceful nature walk, enhanced by the gentle rustling of leaves in the light breeze. The camera follows the dog from behind, maintaining a steady focus as it gracefully moves down the path." + }, + { + "prompt_en": "a dog playing in park", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A playful dog is enthusiastically running through a lush green park. The dog, with a shiny coat, bounds across the soft grass, its tail wagging happily. The dog leaps occasionally, its ears flopping, as the sun bathes the scene in a warm, inviting glow. The park is dotted with a few blooming flowers, and trees cast gentle shadows on the grass, creating a lively and joyful atmosphere. The camera follows the dog closely, capturing its dynamic and spirited movements from the side." + }, + { + "prompt_en": "a dog drinking water", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A fluffy dog is leisurely drinking water from a stainless steel bowl placed on a wooden deck. The dog has a glossy coat and is charmingly fluffy, with expressive, bright eyes. As it drinks, its ears are perked up attentively. The gentle sunlight filters through the nearby trees, creating a warm and serene atmosphere with dappled patterns of light on the deck. 
The camera captures a close-up view of the dog's face, focusing on its content expression and the slight ripples in the water as it drinks." + }, + { + "prompt_en": "a dog running happily", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A playful dog runs happily across a lush green field, its fur shimmering in the sunlight. The dog's ears perk up with excitement, and its tail wags energetically. As it dashes towards the camera, the dog's eyes sparkle with joy and its tongue lolls out in a carefree expression. The scene is filled with the vibrant energy of the dog's movements, with grass blades swaying in the gentle breeze. The camera smoothly follows the dog, capturing its joyful romp up close, with the bright sky providing a cheerful backdrop." + }, + { + "prompt_en": "a horse bending down to drink water from a river", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A majestic horse running to the river, stands at the edge of a tranquil river, gracefully bending down to drink the clear water. Its sleek coat gleams under the soft sunlight, highlighting its strong, muscular form. The horse's mane gently flows in the breeze as it lowers its head to the water's surface. The scene is serene and peaceful, with the gentle sound of the river flowing in the background. The camera slowly pans closer, capturing the refreshing splash as the horse takes a sip, surrounded by the lush greenery lining the riverbank." + }, + { + "prompt_en": "a horse galloping across an open field", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A magnificent horse with a sleek, shiny coat is galloping across an open field, its mane and tail flowing gracefully in the wind. The sun casts a golden hue over the expansive field, which is covered in lush green grass. As the horse runs towards the right side of the frame, its powerful muscles ripple with each stride. The scene is vibrant and full of energy, underlined by the rhythmic pounding of hooves on the ground. The camera pans smoothly to follow the horse's movement, capturing the essence of freedom and agility." + }, + { + "prompt_en": "a horse taking a peaceful walk", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A majestic horse with a glossy coat is taking a peaceful walk along a tranquil dirt path. The surroundings are lush and green, with tall trees lining both sides of the path and sunlight filtering through the leaves, casting dappled shadows on the ground. The horse walks gracefully, its mane gently swaying with each step. The camera follows alongside the horse, capturing its calm demeanor and the serene beauty of the natural setting." + }, + { + "prompt_en": "a horse running to join a herd of its kind", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A majestic horse with a glossy coat gallops across a sprawling meadow, its muscles rippling as it moves with powerful strides. The horse is running at a swift pace, its mane flowing in the breeze, as it heads towards a distant herd of fellow horses. The herd, comprising various colors and sizes, is grazing peacefully on the lush, green grass, with a few lifting their heads to watch the approaching horse. 
The sun hangs low in the sky, casting a golden hue over the scene, creating an atmosphere of tranquility and unity. The camera follows closely behind the horse, capturing its determined movement towards the herd." + }, + { + "prompt_en": "a sheep bending down to drink water from a river", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A fluffy sheep with a pristine white fleece runs quickly to the river and bends down near the riverbank, its head lowered to sip water from the gently flowing river. The surrounding area is lush and green, with the sunlight filtering through the trees, casting dappled shadows on the water's surface. The sheep's reflection shimmers in the clear water as it drinks, creating a serene and peaceful atmosphere. The camera angle captures the scene from a low perspective, focusing on the sheep's gentle motions and the tranquil aquatic setting." + }, + { + "prompt_en": "a sheep taking a peaceful walk", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A single sheep is taking a peaceful walk across a grassy meadow, its wooly coat glistening softly under the gentle morning sunlight. The sheep ambles leisurely, moving from the left side of the frame to the right, its head occasionally dipping down to graze on the lush greenery. The calm and serene atmosphere is enhanced by the soft rustling of grass in the light breeze. The camera follows the sheep smoothly, maintaining a steady focus as it traverses the peaceful landscape." + }, + { + "prompt_en": "a sheep running to join a herd of its kind", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A sheep with soft, fluffy wool is seen running towards a herd of its kind in a scenic pasture. The sheep's wool glistens under the warm sunlight as it moves swiftly across the green grass. In the background, the herd of sheep is grazing peacefully, creating a cozy and serene scene. The running sheep kicks up small tufts of grass with each step, and its movement is smooth and determined. The camera smoothly follows the sheep's approach, capturing its urgency and excitement of rejoining the group." + }, + { + "prompt_en": "a cow bending down to drink water from a river", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A cow stands near the edge of a tranquil river, its body casting a soft shadow on the water. The cow is bending its head down, gently touching the surface of the river with its nose as it takes a sip. Its coat is a rich brown with white patches, and the water reflects these colors with a shimmering effect. The surrounding area features lush green grass and a few scattered rocks, contributing to a serene, pastoral atmosphere. The camera smoothly zooms in towards the cow, capturing the details of its gentle motion and the ripples in the water." + }, + { + "prompt_en": "a cow chewing cud while resting in a tranquil barn", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "In a tranquil barn with warm, soft lighting, a cow is reclining comfortably on a bed of straw, chewing cud contentedly. The cow has a glossy coat with distinctive black and white patches. Its gentle eyes blink slowly as it moves its jaw rhythmically. The barn is constructed with wooden beams, casting subtle shadows across the barn floor. 
The camera captures a close-up of the cow's serene expression and the surrounding tranquil environment. Dust motes float lazily in the air, adding to the peaceful atmosphere." + }, + { + "prompt_en": "a cow running to join a herd of its kind", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A cow is seen running across a grassy field, moving towards a herd of other cows in the distance. The scene is bathed in the golden light of a setting sun, creating a warm and serene atmosphere. The cow, with a shiny brown coat and strong build, moves with a determined pace, its ears perked up as it approaches the herd. Dust rises gently in its wake, adding a sense of motion to the scene. The camera smoothly pans to follow the cow's movement, capturing the herd growing larger as it nears." + }, + { + "prompt_en": "an elephant spraying itself with water using its trunk to cool down", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "In the warm, sunlit setting of a savannah, a majestic elephant stands near a small waterhole. The elephant gracefully lifts its powerful trunk and sprays a refreshing arc of water over its back, creating sparkling droplets in the sunlight. The water cascades down its thick, gray skin, offering a momentary shimmer before being absorbed by the heat. The camera pans around the elephant, capturing this beautiful and natural cooling ritual from various angles, with the golden light enhancing the serene atmosphere." + }, + { + "prompt_en": "an elephant taking a peaceful walk", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "An elephant is taking a peaceful walk along a dirt path surrounded by tall grass and scattered trees. The camera follows smoothly alongside the elephant, capturing its gentle and rhythmic movements. The sun casts a golden light over the scene, creating a serene and warm atmosphere. The elephant's large, graceful steps create soft thuds on the ground, and its ears gently flap as it moves forward, giving an impression of calmness and tranquility in the natural surroundings." + }, + { + "prompt_en": "an elephant running to join a herd of its kind", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "An elephant is seen running across a dusty terrain, heading towards a herd of other elephants in the near distance. The elephant's massive ears flap with each stride, and its trunk sways rhythmically as it moves. The sunlight glints off its gray, textured skin, highlighting the patterns of its wrinkles. The herd, standing in an open landscape, awaits its arrival. Dust kicks up around the elephant's feet, creating a soft haze in the warm sunlight, enhancing the sense of motion." + }, + { + "prompt_en": "a bear catching a salmon in its powerful jaws", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A large brown bear stands partially submerged in a rushing river, its fur wet and glistening under the sunlight. The bear's eyes are sharply focused as it lunges forward, its powerful jaws open wide. In a swift, fluid motion, the bear captures a leaping salmon in its jaws, with the fish's silvery body gleaming in the light. 
Water splashes around the bear's rounded body, highlighting the intensity and strength of the moment, while the surrounding riverbanks create a natural and rugged atmosphere. The camera smoothly pans to capture the bear's triumphant stance amidst the spray of the river." + }, + { + "prompt_en": "a bear sniffing the air for scents of food", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A bear, with a thick coat of brown fur, stands on the edge of a forest clearing. The bear opens its mouth slightly and raises its snout towards the sky, sniffing the air for the scent of food. The sunlight filters through the trees, casting patches of light and shadow on the ground. The bear's large paws and muscular body are visible as it stands still, intently focused on the scents carried by the wind. The camera slowly circles around the bear, providing a detailed view of its concentrated expression and its surroundings." + }, + { + "prompt_en": "a bear climbing a tree", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A bear is seen ascending a tall tree trunk, its powerful claws gripping the bark as it climbs upwards. The bear's fur is thick and shaggy, with shades of deep brown contrasting against the green foliage. The camera captures the bear from a side angle, moving slightly upwards to follow its progress. Sunlight filters through the leaves, casting dappled patterns of light and shadow around the scene. The bear moves steadily, occasionally glancing around the environment, showcasing its strength and agility in a calm forest atmosphere." + }, + { + "prompt_en": "a bear hunting for prey", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A powerful brown bear is moving purposefully through a dense forest, scanning the ground ahead for prey. The sunlight filters through the trees, creating dappled patterns on the forest floor. The bear's thick fur and muscular build add to its imposing presence as it sniffs the air, searching intently for signs of movement. Its eyes are focused, and its movements are calculated and deliberate. The camera follows closely, capturing the bear's every step as it navigates through the woodland, surrounded by tall trees and underbrush." + }, + { + "prompt_en": "a zebra bending down to drink water from a river", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A zebra runs quickly to the river and stands gracefully at the edge of a gently flowing river, bending its neck down to drink the clear water. The zebra's striped pattern contrasts with the surrounding landscape of green vegetation. Its ears are perked up, and its tail sways slightly, adding a sense of calm and serenity to the scene. The sunlight glistens on the river's surface, creating a peaceful and natural atmosphere. The camera captures the scene with a gentle zoom, highlighting the tranquil interaction between the zebra and its environment." + }, + { + "prompt_en": "a zebra running to join a herd of its kind", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A zebra is energetically galloping across an open grassy savannah, moving swiftly toward a herd of zebras grazing in the distance. The sun casts a golden light over the scene, highlighting the zebra's distinctive black and white stripes as it runs. 
The camera follows the zebra's movement from a low angle, capturing the dust kicked up by its hooves. As the zebra nears the herd, the other zebras lift their heads to acknowledge its arrival. The scene conveys a sense of unity and vibrant life in the wild." + }, + { + "prompt_en": "a zebra taking a peaceful walk", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A zebra, with its distinct black and white stripes, is taking a peaceful walk through a grassy savanna. The zebra moves steadily and gracefully, its tail gently swaying with each step as it walks away from the camera. The surrounding landscape is dotted with a few acacia trees and stretches under a clear blue sky. Soft sunlight bathes the scene, enhancing the natural beauty and tranquility of the setting. The camera follows the zebra's movement, capturing the serene ambiance of the savanna." + }, + { + "prompt_en": "a giraffe bending down to drink water from a river", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A giraffe strolls gracefully across an open savannah, its long neck swaying gently with each step. The landscape is bathed in the warm, golden light of late afternoon, creating a serene atmosphere. The giraffe's distinctive coat patterns stand out against the dry grass and sparse, green acacia trees. As it walks towards the right, the camera follows smoothly, capturing the gentle rhythm of its movement and the beauty of the surrounding natural environment." + }, + { + "prompt_en": "a giraffe taking a peaceful walk", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A giraffe strolls gracefully across an open savannah, its long neck swaying gently with each step. The landscape is bathed in the warm, golden light of late afternoon, creating a serene atmosphere. The giraffe's distinctive coat patterns stand out against the dry grass and sparse, green acacia trees. As it walks towards the right, the camera follows smoothly, capturing the gentle rhythm of its movement and the beauty of the surrounding natural environment." + }, + { + "prompt_en": "a giraffe running to join a herd of its kind", + "dimension": [ + "subject_consistency", + "dynamic_degree", + "motion_smoothness" + ], + "refined_prompt": "A graceful giraffe, with its long neck and distinctive patches, is seen running gracefully across the savannah towards a distant herd of other giraffes. The terrain is a mix of golden grasslands and scattered acacia trees, bathed in the warm light of the afternoon sun. As the giraffe moves swiftly, its legs stretch out in a rhythmic motion, and its tail gently sways side to side. The camera follows the giraffe, panning smoothly to capture its approach, until it reaches the group, blending seamlessly with the other giraffes in the herd." + }, + { + "prompt_en": "a person", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "person" + } + }, + "refined_prompt": "A young and beautiful woman is casually sitting on a park bench surrounded by lush greenery. She has long, flowing hair and is wearing a stylish, light summer dress with floral patterns. Her eyes are bright and animated as she gazes thoughtfully into the distance, with a gentle smile gracing her lips. Her posture is relaxed, with one leg crossed over the other. The scene is bathed in warm, natural light, creating a serene and peaceful atmosphere. 
The camera smoothly pans from right to left to capture her elegant profile against the backdrop of the vibrant park setting." + }, + { + "prompt_en": "a bicycle", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "bicycle" + } + }, + "refined_prompt": "A sleek, modern bicycle rests upright on a paved path surrounded by lush greenery. The bicycle has a metallic frame with a matte finish, sleek black tires, and a comfortable saddle. Sunlight filters through the leaves of the trees above, casting dappled patterns of light and shadow on the path. The camera pans slowly from the front wheel to the back, capturing the bicycle's streamlined design and the tranquil setting." + }, + { + "prompt_en": "a car", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "car" + } + }, + "refined_prompt": "A sleek, red sports car is parked at an angle on a sunlit road, surrounded by vibrant green trees in the background. The car's glossy surface reflects the sunlight, accentuating its smooth curves and stylish design. The windows are slightly tinted, and the polished chrome wheels add a touch of luxury. The camera slowly pans from the front to the side, showcasing the car's elegant profile. The overall atmosphere is bright and serene, highlighting the car as the focal point." + }, + { + "prompt_en": "a motorcycle", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "motorcycle" + } + }, + "refined_prompt": "A sleek motorcycle is parked on a quiet, open road surrounded by a picturesque natural landscape. The motorcycle has a shiny black and chrome finish that reflects the ambient light, highlighting its elegant curves. The road stretches into the distance, bordered by lush green fields and a blue sky above. The atmosphere is peaceful, with a gentle breeze suggested by the movement of nearby grass. The camera slowly pans from left to right, capturing the motorcycle from various angles and emphasizing its polished design." + }, + { + "prompt_en": "an airplane", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "airplane" + } + }, + "refined_prompt": "A sleek, modern airplane is captured from a side view as it soars through a clear blue sky. The sunlight reflects off its polished metal surface, emphasizing the plane's streamlined design. The airplane's wings are slightly tilted, indicating a gentle ascent. Contrails form behind its engines, adding to the sense of motion and speed. The camera pans slowly from left to right, following the airplane's graceful movement against the vast sky." + }, + { + "prompt_en": "a bus", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "bus" + } + }, + "refined_prompt": "A large blue bus is parked at a bustling city street, with the sun reflecting off its windows. The bus has sleek, modern lines and several large windows along the side, offering a glimpse of its comfortable interior. The wheels are shiny and well-maintained, contributing to the bus's polished appearance. The camera starts with a wide angle, showing the bus from the front and slightly to the side, capturing pedestrians walking nearby and other vehicles on the road. As the camera moves closer, it pans along the side of the bus, emphasizing its size and design details." 
+ }, + { + "prompt_en": "a train", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "train" + } + }, + "refined_prompt": "A sleek, modern train is gliding smoothly along a set of tracks, moving towards the camera. The silver exterior of the train reflects the surrounding landscape, adding a dynamic element to its motion. The train's headlights are on, casting a bright glow on the tracks ahead. Alongside the train, a lush green countryside is visible, providing a serene backdrop. As the train passes by, there's a slight whooshing sound, emphasizing its speed and efficiency. The camera pans slightly to follow the train's movement, capturing its journey through the picturesque landscape." + }, + { + "prompt_en": "a truck", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "truck" + } + }, + "refined_prompt": "A large, shiny red truck is parked in front of a warehouse on a sunny day, providing a backdrop of blue sky and scattered white clouds. The truck is sleek and well-polished, reflecting the light off its surface. The camera pans smoothly from the front to the side, highlighting its robust grill and large tires. The warehouse in the background has an industrial look with metal siding and large bay doors. The scene conveys a sense of power and reliability, emphasizing the truck's impressive size and design." + }, + { + "prompt_en": "a boat", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "boat" + } + }, + "refined_prompt": "A small wooden boat with a polished finish gently floats on clear, calm water. The boat is equipped with sturdy oars resting along the sides. The camera slowly moves closer, revealing the boat's elegant design and the soft reflections of the sky and surrounding landscape in the water. A few ripples form around the boat, creating a serene and tranquil atmosphere." + }, + { + "prompt_en": "a traffic light", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "traffic light" + } + }, + "refined_prompt": "A close-up view of a traffic light is shown against a slightly overcast sky. The traffic light features three circular lenses: red at the top, yellow in the middle, and green at the bottom. The red light is illuminated, casting a bright hue onto the surrounding metal casing. A gentle breeze sways the cables supporting the traffic light, adding a subtle motion to the scene. The camera slowly zooms out, providing a broader perspective of the traffic light and hinting at the urban environment around it." + }, + { + "prompt_en": "a fire hydrant", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "fire hydrant" + } + }, + "refined_prompt": "A bright red fire hydrant stands on a patch of vibrant green grass. The hydrant's glossy surface reflects the sunlight, creating a vivid contrast with its surroundings. It features two side nozzles and a central cap, all securely fastened. The camera gently pans around the hydrant, providing a view from various angles. The background includes a blurred view of a sidewalk and a few scattered fallen leaves, adding a serene suburban atmosphere to the scene." + }, + { + "prompt_en": "a stop sign", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "stop sign" + } + }, + "refined_prompt": "A bright red stop sign stands prominently at the intersection, surrounded by lush green foliage. 
The sign is perfectly centered with its white lettering crisp and clear against the red background. Sunlight filters through the leaves, casting dappled shadows on the sign, creating a serene and slightly nostalgic atmosphere. The camera gently pans from the base of the pole upwards to give an unobstructed view of the sign against the sky." + }, + { + "prompt_en": "a parking meter", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "parking meter" + } + }, + "refined_prompt": "A parking meter stands on the edge of a bustling city street. The meter is metallic with a digital display and buttons below it. The sun casts a bright light on its surface, accentuating its sleek contours. Cars are parked along the curb, and pedestrians walk by in the background, adding to the urban atmosphere. The camera pans slowly from the base of the meter upwards, capturing the entirety of its structure against the lively street scene." + }, + { + "prompt_en": "a bench", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "bench" + } + }, + "refined_prompt": "A wooden bench is situated in a tranquil park setting, surrounded by lush greenery. Sunlight filters through the tree leaves, casting dappled patterns on the ground. The bench has a simple, classic design with a backrest and armrests, inviting relaxation. Fallen leaves scatter around the base of the bench, adding to the serene atmosphere. The camera slowly pans from left to right, capturing the peaceful scene and the gentle rustling of leaves in the soft breeze." + }, + { + "prompt_en": "a bird", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "bird" + } + }, + "refined_prompt": "A vibrant bird sits perched on a slender tree branch. The bird has glossy feathers and a striking coloration, with shades of blue and green shimmering under the soft sunlight. The bird's keen eyes are framed by a delicate beak, and it momentarily tilts its head, giving a curious and alert expression. The camera gently pans to capture the subtle swaying of the branch in the breeze, adding a sense of tranquility and natural beauty to the scene." + }, + { + "prompt_en": "a cat", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "cat" + } + }, + "refined_prompt": "A fluffy cat with soft, orange tabby fur sits on a sunlit windowsill, gazing curiously outside. The sunlight filters through the window, casting a warm and gentle glow on the cat's fur. The cat's large, expressive green eyes are focused intently on something outside, and its ears are perked up with interest. The tail is casually draped over the edge of the windowsill, swaying ever so slightly. The camera gently zooms in, capturing the serene and contemplative expression of the cat, adding a peaceful atmosphere to the scene." + }, + { + "prompt_en": "a dog", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "dog" + } + }, + "refined_prompt": "A playful dog is energetically running towards the camera in an open grassy field. The dog has a shiny coat, perky ears, and a joyful expression, its tongue playfully hanging out. The sunlight bathes the scene in a warm glow, highlighting the dog's fur as it bounds across the grass. The camera captures the dog's happy movement with a slightly upward angle, enhancing the playful atmosphere of the scene." 
+ }, + { + "prompt_en": "a horse", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "horse" + } + }, + "refined_prompt": "A stunning horse stands in a lush green field, its glossy coat gleaming in the sunlight. The horse is a deep chestnut color, with a sleek mane cascading down its neck. It stands gracefully with its head slightly tilted, and its ears are attentively perked up. The background reveals a few scattered trees, adding to the tranquil ambiance. The camera captures the scene with a gentle, slow zoom toward the horse, emphasizing its beauty and strength." + }, + { + "prompt_en": "a sheep", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "sheep" + } + }, + "refined_prompt": "A fluffy white sheep stands in a grassy field, with a backdrop of rolling hills under a clear blue sky. The sheep is facing towards the camera, its soft wool catching the sunlight, giving it a warm and gentle glow. Its ears are perked up, and it looks calm and curious. The camera slowly zooms in to capture the sheep's face, creating a serene and pastoral atmosphere." + }, + { + "prompt_en": "a cow", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "cow" + } + }, + "refined_prompt": "A majestic brown and white cow stands in a lush green pasture, with patches of wildflowers scattered around. Its large, expressive eyes and long lashes give it a gentle appearance. The cow's coat glistens softly in the sunlight, and its ears occasionally flick to ward off insects. In the background, a few trees sway gently in the breeze, adding a serene and natural ambiance to the scene. The camera pans slowly from left to right, capturing the cow's peaceful grazing and the quiet beauty of the surrounding landscape." + }, + { + "prompt_en": "an elephant", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "elephant" + } + }, + "refined_prompt": "An elephant stands majestically under the warm rays of the sun, its large ears flapping gently as it sways. The elephant's ancient, wrinkled skin catches the light, highlighting its sturdy, powerful physique. Its trunk gracefully explores the ground, moving rhythmically as it searches for food. The background features a savanna landscape, with scattered trees and golden grasses stretching out towards the horizon. The camera pans slowly around the elephant, capturing its regal presence and the serene environment." + }, + { + "prompt_en": "a bear", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "bear" + } + }, + "refined_prompt": "A majestic bear is walking through a dense forest. The bear has a thick, glossy coat of fur that glimmers under the dappled sunlight filtering through the trees. Its powerful paws make a soft thud against the ground as it moves forward, with a calm and purposeful gait. The bear's ears are alert and its eyes are observant, taking in the surrounding environment. The camera follows the bear's movement, capturing the peaceful and natural atmosphere of the forest." + }, + { + "prompt_en": "a zebra", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "zebra" + } + }, + "refined_prompt": "A zebra stands gracefully on a lush savannah, surrounded by tall golden grasses gently swaying in the breeze. The zebra's distinctive black and white stripes create a captivating pattern under the bright sunlight. 
It is positioned in profile, facing right, with its mane standing up slightly along its neck. The serene atmosphere is complemented by a clear blue sky overhead. The camera slowly pans from left to right, capturing the elegant posture and natural beauty of the zebra against the vibrant landscape." + }, + { + "prompt_en": "a giraffe", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "giraffe" + } + }, + "refined_prompt": "A lone giraffe stands gracefully in an open savanna, surrounded by tall grasses swaying gently in the breeze. The sun casts a warm, golden light across the landscape, highlighting the giraffe's elegant spots and long neck. The giraffe turns its head slowly, its large eyes scanning the horizon. The scene captures the serene beauty of the wild, with the giraffe as its majestic focal point. The camera pans slightly to reveal more of the surrounding savanna, emphasizing the serene and expansive setting." + }, + { + "prompt_en": "a backpack", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "backpack" + } + }, + "refined_prompt": "A rugged dark green backpack is positioned on a wooden table, with zippers slightly open, revealing the neatly packed contents within. The backpack has multiple compartments and sturdy straps, showcasing both functionality and durability. Sunlight streams in from a nearby window, casting a soft, golden light on the backpack, accentuating its texture and design. The camera slowly zooms in, providing a closer look at the material and the details of the zippers and buckles, creating an inviting and adventurous atmosphere." + }, + { + "prompt_en": "an umbrella", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "umbrella" + } + }, + "refined_prompt": "An open umbrella with a vibrant, multicolored canopy, featuring alternating stripes of red, blue, yellow, and green, stands upright against a soft, cloudy sky backdrop. The camera captures the umbrella from below, highlighting its structure and design as it lightly sways in the breeze. The handle is classic black, sleek, and straight, anchored firmly. Gentle light filters through the fabric, creating a vivid, cheerful atmosphere. The movement of the camera provides a dynamic view of the umbrella as it shifts slightly with the passing breeze." + }, + { + "prompt_en": "a handbag", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "handbag" + } + }, + "refined_prompt": "A stylish handbag is displayed against a neutral background, emphasizing its elegant design. The handbag is made of high-quality leather with a smooth texture and features a sleek, minimalist design. It has a rich, warm brown color with a subtle sheen under soft lighting. The camera gently pans across the handbag, highlighting its gold-tone hardware, such as the buckles and zipper. The structured shape of the handbag is complemented by a pair of sturdy handles resting gracefully on top." + }, + { + "prompt_en": "a tie", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "tie" + } + }, + "refined_prompt": "A close-up view of a luxurious silk tie is draped elegantly over a smooth, polished surface. The tie boasts a deep blue hue with subtle silver stripes running diagonally across its fabric. The texture of the silk catches the light, creating a soft sheen that enhances its opulent appearance. 
Folded neatly with an end gently curling, the tie portrays a sense of sophistication and style. The camera captures the tie from a slightly angled top-down perspective, emphasizing its fine detailing and rich color." + }, + { + "prompt_en": "a suitcase", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "suitcase" + } + }, + "refined_prompt": "A sleek, black suitcase stands upright on a smooth wooden floor, illuminated by gentle overhead lighting that highlights its glossy finish. The suitcase has sturdy wheels and a telescopic handle, which is fully extended, pointing upwards. The camera slowly circles around the suitcase, showcasing its modern design and subtle textural details on the surface. The calm and minimalistic setting emphasizes the suitcase's contemporary and practical style." + }, + { + "prompt_en": "a frisbee", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "frisbee" + } + }, + "refined_prompt": "A brightly colored frisbee spins gracefully through the air against a clear blue sky. The camera follows the frisbee's smooth trajectory from a side angle, capturing its rapid rotation and vibrant hues as it glides effortlessly. Sunlight catches its edges, highlighting the frisbee's dynamic motion before it begins to descend towards an unseen landing point off-screen." + }, + { + "prompt_en": "skis", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "skis" + } + }, + "refined_prompt": "A pair of skis is positioned upright in the snow, with a backdrop of snow-covered mountains and a clear blue sky. The skis have a sleek, modern design with vibrant red and black patterns running along their length. They are placed close together, with snow gently clinging to their surfaces, creating a sense of freshness. The natural lighting emphasizes the crispness of the scene, and the camera slowly pans from the skis upward, capturing the expansive mountain landscape in the background." + }, + { + "prompt_en": "a snowboard", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "snowboard" + } + }, + "refined_prompt": "A snowboard lies on a snowy mountainside, partially covered by fresh powder. The board features a colorful geometric design with vibrant blues and reds, contrasting sharply against the white snow. The bindings are visible on top, ready for a rider. The camera pans slowly from the tail to the nose of the snowboard, capturing the intricate details of the design and the texture of the snow surrounding it. The scene is set under a clear blue sky, creating a crisp and invigorating atmosphere." + }, + { + "prompt_en": "a sports ball", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "sports ball" + } + }, + "refined_prompt": "A vivid image captures a brightly colored sports ball resting on a lush, green grass field. The ball, intricately designed with a bold, geometric pattern, features bright panels of red, blue, and white, making it stand out against the natural background. Sunlight casts a gentle glow on the scene, highlighting the textures of both the ball and the grass. The camera smoothly pans around the ball, showcasing its design from different angles." 
+ }, + { + "prompt_en": "a kite", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "kite" + } + }, + "refined_prompt": "A brightly colored kite sails through the clear blue sky, its tail fluttering gracefully in the wind. The kite's diamond-shaped body is adorned with a vibrant pattern of red, yellow, and blue, creating a striking contrast against the sky. The long tail sways rhythmically as gentle breezes push the kite upwards. The camera follows the kite's movement, capturing its gentle ascent and the mesmerizing dance of the tail in the air." + }, + { + "prompt_en": "a baseball bat", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "baseball bat" + } + }, + "refined_prompt": "A close-up view showcases a polished wooden baseball bat lying on a soft patch of grass. The bat's smooth surface glistens under the soft, ambient sunlight filtering from above, highlighting its rich wood grain. The camera slowly pans from the handle to the barrel, capturing the bat's sleek design and glossy finish. The gentle sway of nearby blades of grass adds a touch of natural elegance to the scene." + }, + { + "prompt_en": "a baseball glove", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "baseball glove" + } + }, + "refined_prompt": "A close-up view of a well-worn baseball glove lies on a wooden surface, showcasing its rich, brown leather and intricate stitching. The texture of the leather is visible, and the glove is slightly open, revealing its inner padding. Soft, natural light from a nearby window casts gentle shadows, adding depth to the scene. The camera slowly pans across the glove, highlighting its craftsmanship and the subtle creases that speak to years of use." + }, + { + "prompt_en": "a skateboard", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "skateboard" + } + }, + "refined_prompt": "A close-up view shows a skateboard on the ground, highlighting its vibrant, multicolored deck with an abstract pattern. The skateboard is positioned at an angle, with its polished metal trucks and smooth, red wheels in clear focus. The lighting casts subtle shadows, emphasizing the smooth texture of the deck's surface. The camera makes a slow, sweeping motion around the skateboard, capturing every detail as it transitions from the front to the back of the skateboard, showcasing its sleek design." + }, + { + "prompt_en": "a surfboard", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "surfboard" + } + }, + "refined_prompt": "A vibrant surfboard is positioned upright in the sand on a sunlit beach. The surfboard features eye-catching patterns in shades of blue and yellow, which stand out against the sandy backdrop. The sunlight gleams off the board's surface, highlighting its sleek design. The gentle waves of the ocean can be seen in the background, with a few seagulls flying overhead, adding to the lively beach atmosphere. The camera slowly pans from left to right, capturing the beauty of the surfboard and its surroundings." + }, + { + "prompt_en": "a tennis racket", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "tennis racket" + } + }, + "refined_prompt": "A close-up view of a tennis racket showcases its intricate details. The racket has a sleek, glossy black frame intertwined with vibrant green accents. 
The strings are tightly woven, forming a symmetrical pattern across the head of the racket. The grip, wrapped in a black cushioned material, displays a distinct texture for a firm hold. The camera slowly pans from the head to the handle, highlighting the craftsmanship and design of the tennis racket against a softly lit background." + }, + { + "prompt_en": "a bottle", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "bottle" + } + }, + "refined_prompt": "A sleek, transparent glass bottle filled with a light blue liquid is set against a softly illuminated background. The bottle features a minimalist design with a simple silver cap, reflecting light gently. As the camera slowly circles the bottle, the liquid inside shimmers slightly, catching the glow from the ambient lighting. The overall atmosphere is calm and sophisticated, highlighting the bottle's elegance and clarity." + }, + { + "prompt_en": "a wine glass", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "wine glass" + } + }, + "refined_prompt": "A crystal-clear wine glass sits elegantly on a wooden table. The glass is half-filled with a deep red wine, and its surface catches the soft, ambient light from above, creating subtle reflections and highlights. The rich color of the wine contrasts beautifully with the transparency of the glass. The camera smoothly zooms in, capturing the delicate silhouette of the glass and the gentle play of light on its curves." + }, + { + "prompt_en": "a cup", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "cup" + } + }, + "refined_prompt": "A close-up shot focuses on a ceramic coffee cup with a sleek, white finish, sitting on a smooth wooden surface. Steam rises gently from the hot coffee inside, creating a warm and inviting atmosphere. The sunlight filters in from a nearby window, casting a soft, natural light on the scene and highlighting the cup's glossy texture. The camera slowly pans around the cup, capturing its elegant handle and the subtle shadows it casts on the table." + }, + { + "prompt_en": "a fork", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "fork" + } + }, + "refined_prompt": "A sleek, shiny silver fork is lying on a white marble surface. The camera captures the fork from a side angle, slowly panning from the handle to the prongs. The reflection of the fork glimmers on the marble surface, accentuating its polished appearance. The lighting is bright and focused, highlighting the smooth texture and contours of the fork. The scene has a clean and minimalist ambiance." + }, + { + "prompt_en": "a knife", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "knife" + } + }, + "refined_prompt": "A gleaming kitchen knife is placed on a cutting board in a brightly lit kitchen. The knife features a polished stainless steel blade with a sharp edge, and its handle is made of sleek, dark wood with visible grain patterns. The light reflects off the blade, highlighting its sharpness and precision. The scene is set in a crisp, clean kitchen environment, with subtle shadows accentuating the knife's form. The camera slowly moves around the knife, showcasing its contours and craftsmanship." 
+ }, + { + "prompt_en": "a spoon", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "spoon" + } + }, + "refined_prompt": "A shiny silver spoon is resting on a smooth, dark wooden table. The camera smoothly zooms in to capture the spoon's sleek and polished surface, which reflects the ambient lighting in the room. The gentle lighting highlights the spoon's curved handle and elegant shape, creating a serene and simple atmosphere. The close-up view allows the intricate details of the spoon's design to be appreciated, as the camera slowly pans from the handle to the spoon's bowl." + }, + { + "prompt_en": "a bowl", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "bowl" + } + }, + "refined_prompt": "A simple yet elegant ceramic bowl is placed on a wooden table. The bowl has a glossy white finish with a subtle blue pattern along the rim. The light above casts a gentle glow on the bowl's surface, accentuating its smooth texture. The camera slowly circles around the bowl, showcasing its symmetry and delicate design. The calm and serene atmosphere highlights the bowl's understated beauty and craftsmanship." + }, + { + "prompt_en": "a banana", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "banana" + } + }, + "refined_prompt": "A ripe banana is placed neatly on a wooden table surface, with its vibrant yellow peel contrasting against the warm brown of the wood. The banana has a gentle curve and a smooth texture, with one end slightly opening to reveal a hint of its creamy interior. Soft, natural light filters in from the side, highlighting the subtle spots on the peel and giving the banana a fresh and inviting appearance. The camera captures a close-up shot, slowly rotating to showcase the banana's appealing shape and color." + }, + { + "prompt_en": "an apple", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "apple" + } + }, + "refined_prompt": "A ripe, glossy red apple sits on a rustic wooden table, bathed in soft, natural light from a nearby window. The surface of the apple is flawless, with vibrant hues and a subtle sheen that highlights its round, plump shape. Next to it, a single green leaf is attached to the slender brown stem, adding a touch of contrast to the scene. The camera slowly zooms in, focusing on the apple's immaculate surface, capturing its appealing freshness and texture." + }, + { + "prompt_en": "a sandwich", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "sandwich" + } + }, + "refined_prompt": "A close-up view of a freshly-made sandwich placed on a wooden cutting board. The sandwich is generously filled with slices of crispy lettuce, juicy red tomato, and layers of savory ham and cheese. The bread, toasted to a golden brown, is slightly tilted to reveal the vibrant colors and textures of the ingredients. Soft, diffused lighting highlights the freshness of the ingredients, adding a mouth-watering effect to the scene. The camera subtly moves in a slight circular motion, providing different angles of the appetizing sandwich." + }, + { + "prompt_en": "an orange", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "orange" + } + }, + "refined_prompt": "A vibrant orange sits on a light-colored surface, illuminated by soft, natural lighting that highlights its textured, dimpled skin. 
The round shape and bright color give it a fresh and appealing appearance. The camera slowly zooms in, capturing the details of the orange's surface, before gently circling around to reveal its full form. Shadows cast by studio lights provide dimension, enhancing the orange's rich hue." + }, + { + "prompt_en": "broccoli", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "broccoli" + } + }, + "refined_prompt": "A fresh, vibrant head of broccoli sits prominently against a plain white background. The broccoli is richly green, with tightly packed florets and a sturdy, textured stalk. The camera slowly zooms in, highlighting the intricate detail of the florets and the natural, organic texture of the stalk. Soft, even lighting enhances the freshness and color, making the broccoli appear crisp and appetizing." + }, + { + "prompt_en": "a carrot", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "carrot" + } + }, + "refined_prompt": "A single, vibrant orange carrot is lying on a wooden kitchen counter, its surface smooth and glossy under the soft, warm lighting. The carrot tapers to a point at one end and has a few bright green leafy tops attached at the other end. The camera slowly zooms in to highlight the carrot's fresh appearance and texture, showcasing its rich color and natural details." + }, + { + "prompt_en": "a hot dog", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "hot dog" + } + }, + "refined_prompt": "A close-up view of a delicious hot dog placed neatly in a soft, slightly toasted bun. The hot dog is topped with a generous drizzle of mustard and ketchup, creating a vibrant and appetizing contrast against the warm tones of the bun. The camera slowly pans across the hot dog, highlighting the texture of the grilled sausage and the glossy sheen of the condiments. The setting features a simple, clean background, keeping the focus on the mouth-watering hot dog. Soft, warm lighting enhances the overall appeal and highlights the tempting details." + }, + { + "prompt_en": "a pizza", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "pizza" + } + }, + "refined_prompt": "A freshly baked pizza is placed on a rustic wooden table. The camera captures a top-down view of the pizza, showcasing its perfectly golden-brown crust, bubbling cheese, and an assortment of vibrant toppings, including fresh basil leaves, ripe tomatoes, and slices of pepperoni. The pizza is still hot, with steam rising gently and the cheese glistening under a warm, soft light. The entire scene conveys a sense of appetizing warmth and inviting aroma." + }, + { + "prompt_en": "a donut", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "donut" + } + }, + "refined_prompt": "A single glazed donut is placed on a small white plate. The donut has a smooth, shiny glaze, giving it an appetizing appearance. The camera moves in for a close-up, highlighting the glossy texture and soft, fluffy dough. The plate rests on a dark wooden table, providing contrast that makes the donut stand out. The lighting is warm, creating an inviting and delicious atmosphere." + }, + { + "prompt_en": "a cake", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "cake" + } + }, + "refined_prompt": "A cake is beautifully presented on a decorative platter. 
It features a smooth layer of white frosting, topped with a trail of vibrant red and green berries arranged artfully on top. The cake's sides are garnished with delicate sugar flowers, adding a touch of elegance. The camera slowly zooms in to reveal the intricate details of the decorations, capturing the texture of the frosting and the fresh, juicy appearance of the berries. Soft lighting enhances the cake's beautiful presentation, creating a warm and inviting atmosphere." + }, + { + "prompt_en": "a chair", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "chair" + } + }, + "refined_prompt": "A single modern chair is positioned elegantly in the center of a spacious, well-lit room. The chair features sleek metal legs and a plush white cushion, creating a sense of contemporary style and comfort. Soft natural light filters in from a nearby window, casting gentle shadows on the floor around the chair. The camera smoothly circles around the chair, capturing various angles and emphasizing the quality of its design." + }, + { + "prompt_en": "a couch", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "couch" + } + }, + "refined_prompt": "A plush, comfortable couch is positioned in a well-lit living room. The couch features soft, beige fabric and is adorned with a few colorful throw pillows in shades of blue and green. Sunlight streams in from a nearby window, creating a warm and inviting atmosphere in the room. The camera pans slowly from left to right, capturing the details of the couch and the play of light across its surface, highlighting its cozy and inviting nature." + }, + { + "prompt_en": "a potted plant", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "potted plant" + } + }, + "refined_prompt": "A close-up view displays a small, charming potted plant sitting on a wooden table. The plant has vibrant green leaves that fan out gracefully from its center, giving it a fresh and lively appearance. The pot is made of smooth, white ceramic, which contrasts beautifully with the lush greenery of the plant. The camera slowly rotates around the plant, highlighting the delicate texture and natural beauty of the leaves. Soft natural light filters from the side, casting gentle shadows and creating a calm, serene atmosphere." + }, + { + "prompt_en": "a bed", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "bed" + } + }, + "refined_prompt": "A cozy bed is positioned in the center of a softly lit room. The bed is made up with a plush, white comforter and several fluffy pillows arranged at the headboard. The pillows are a mix of different sizes and are all in shades of white and cream, adding to the inviting look. A small bedside table with a warm, dimly lit lamp is situated next to the bed, casting a gentle glow over the scene and enhancing the peaceful atmosphere." + }, + { + "prompt_en": "a dining table", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "dining table" + } + }, + "refined_prompt": "A neatly set dining table is centered in a warmly lit room, creating an inviting atmosphere. The table is covered with a white tablecloth and features elegant place settings for four, including polished silverware and fine china plates. A floral centerpiece adds a touch of color, with vibrant red and white blooms arranged in a clear vase. 
The camera slowly pans across the table, highlighting the sparkling glassware and neatly folded cloth napkins, before coming to rest at the centerpiece in the middle, capturing the harmonious dining scene." + }, + { + "prompt_en": "a toilet", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "toilet" + } + }, + "refined_prompt": "A pristine white toilet is positioned against a smooth, light-colored wall in a clean and modern bathroom. The toilet has a sleek design with a smooth, rounded lid. The chrome handle on the side gleams slightly under the soft overhead lighting. The camera provides a close-up view, slowly panning from the top of the tank down to the base, highlighting the toilet's immaculate surface and contemporary style." + }, + { + "prompt_en": "a tv", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "tv" + } + }, + "refined_prompt": "A modern flat-screen TV is mounted on a sleek wall in a living room. The screen is turned on, displaying a vibrant, colorful image of a scenic landscape with rolling hills and a clear blue sky. Soft, ambient lighting from the room creates a cozy atmosphere, enhancing the visual experience. The camera subtly pans to the side, revealing a minimalist console table beneath the TV, adorned with a few decorative items like a small plant and a couple of books." + }, + { + "prompt_en": "a laptop", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "laptop" + } + }, + "refined_prompt": "A sleek, modern laptop sits open on a wooden desk, its screen displaying a vibrant desktop wallpaper with a calming ocean view. The keyboard, illuminated by soft white backlighting, highlights its elegant design. The surrounding lighting creates a focused and cozy atmosphere. In front of the laptop, a person's hand hovers over the touchpad, gently moving across it, suggesting interaction with the device. The camera slowly zooms in to showcase the laptop's elegant features and vivid screen." + }, + { + "prompt_en": "a remote", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "remote" + } + }, + "refined_prompt": "A sleek, modern remote is positioned on a smooth, wooden surface. The remote is black with a matte finish and features an array of buttons arranged neatly, including a prominent circular navigation button in the center. The lighting is soft and subtle, highlighting the contours and details of the remote without causing any harsh reflections. The camera starts with a close-up shot of the remote, capturing the details of the buttons, before slowly tilting upwards to show the shape and form of the entire remote against the wooden background." + }, + { + "prompt_en": "a keyboard", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "keyboard" + } + }, + "refined_prompt": "A close-up view of a sleek and modern computer keyboard is shown. The keyboard has a black matte finish with illuminated white backlit keys, creating a soft glow in the dimly lit room. The keys are arranged in a standard QWERTY layout, and the camera slowly pans from left to right along the length of the keyboard, highlighting the smooth texture and the precision of the design. Reflections from the surrounding light subtly play on the glossy surface around the keys, adding to the sophisticated atmosphere." 
+ }, + { + "prompt_en": "a cell phone", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "cell phone" + } + }, + "refined_prompt": "A sleek and modern smartphone lies on a smooth wooden table. The phone's screen is illuminated, displaying a vibrant home screen with colorful app icons. The phone has a thin metallic frame and a glossy finish, reflecting light elegantly. The soft ambient lighting creates a warm, inviting atmosphere. The camera gently pans from left to right, highlighting the phone's design and creating a sense of anticipation as it showcases the screen's vivid colors and the device's sleek form." + }, + { + "prompt_en": "a microwave", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "microwave" + } + }, + "refined_prompt": "A sleek, modern microwave is seen in a kitchen setting. The stainless steel exterior gleams under the bright overhead lighting. The microwave's digital display shows the time, and a glass door reveals an empty interior with a rotating glass plate. The control panel features a series of buttons and a dial for selecting various settings. As the camera slowly pans from left to right, a subtle reflection can be seen on the polished surface of the microwave, enhancing its contemporary design." + }, + { + "prompt_en": "an oven", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "oven" + } + }, + "refined_prompt": "A close-up view of a sleek, modern oven with a stainless steel finish. The oven door has a large, clear glass window revealing the glowing interior where two trays of cookies are baking. Soft, warm light emanates from inside, illuminating the golden-brown cookies. The control panel on the top of the oven features digital displays and dials, showing the temperature and cooking time. The camera smoothly zooms in on the cookies, capturing the delicious texture and slight rise as they bake." + }, + { + "prompt_en": "a toaster", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "toaster" + } + }, + "refined_prompt": "A shiny stainless steel toaster sits on a kitchen countertop, glowing softly under the warm kitchen lighting. The toaster features a simple design with two slots on top and a dial on its front for adjusting browning levels. The camera gradually moves closer, capturing the slight reflection of kitchen lights on its surface. The toaster appears ready for use, creating a cozy and inviting kitchen atmosphere." + }, + { + "prompt_en": "a sink", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "sink" + } + }, + "refined_prompt": "A modern sink is set against a sleek, tiled backsplash. The sink features a gleaming stainless steel basin with a curved, high-arc faucet above it. As the camera pans from left to right, the faucet is turned on, and water flows smoothly from it, creating ripples in the basin. The surrounding countertop is clean and polished, with a small soap dispenser placed to the side, completing the elegant and contemporary look of the scene. The lighting is bright, emphasizing the minimalist and stylish design of the sink area." + }, + { + "prompt_en": "a refrigerator", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "refrigerator" + } + }, + "refined_prompt": "The video focuses on a sleek, stainless steel refrigerator standing in a modern kitchen. 
The refrigerator's smooth surface reflects the ambient light, creating a polished and sophisticated appearance. The camera begins with a close-up of the brushed metal texture on the door, then zooms out to reveal the entire appliance. The refrigerator's handles are elegantly designed, and the soft lighting enhances its contemporary style. The room's serene atmosphere is complemented by subtle shadows that add depth to the scene." + }, + { + "prompt_en": "a book", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "book" + } + }, + "refined_prompt": "A hardcover book rests on a wooden table, its cover displaying an intricate design in shades of blue and gold. The book is slightly open, revealing a glimpse of the creamy white pages inside, with delicate illustrations visible. Soft, ambient lighting creates a cozy atmosphere, casting gentle shadows around the book. The camera approaches slowly, providing a closer view of the detailed cover design and the elegance of the book's binding." + }, + { + "prompt_en": "a clock", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "clock" + } + }, + "refined_prompt": "A stylish, vintage wall clock is mounted on a textured brick wall. The clock features a round, dark wooden frame with elegant Roman numerals and antique-style clock hands. The clock's face is ivory, with a classic, weathered look. The slender hour and minute hands move with a smooth, steady motion, indicating the passing time. Soft ambient lighting highlights the texture of the brick wall, creating a warm and inviting atmosphere. The video includes a gentle zoom-in towards the clock face, enhancing the detail of its craftsmanship." + }, + { + "prompt_en": "a vase", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "vase" + } + }, + "refined_prompt": "A single elegant vase sits on a wooden table, centered in the frame. The vase is slender with a glossy porcelain finish and decorated with delicate floral patterns in shades of blue. A soft, ambient light highlights the intricate designs and creates subtle reflections on the vase's surface. The camera gently pans around the vase, showcasing its exquisite craftsmanship and elegant form. Behind the vase, a simple, blurred background puts emphasis on the vase's beauty and detail." + }, + { + "prompt_en": "scissors", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "scissors" + } + }, + "refined_prompt": "A pair of sleek, stainless steel scissors is lying on a smooth black surface. The camera starts with a close-up view, highlighting the sharp blades and polished finish. As the camera slowly pans upwards, the ergonomic black handles come into view, emphasizing their modern design and comfortable grip. The light reflects off the metal, creating a subtle gleam that accentuates the scissors' precise craftsmanship. The background remains softly focused, ensuring the scissors are the focal point of the scene." + }, + { + "prompt_en": "a teddy bear", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "teddy bear" + } + }, + "refined_prompt": "A cuddly teddy bear sits on a soft, cream-colored blanket. The bear has plush, light brown fur and shiny black eyes that give it a friendly expression. It wears a red satin ribbon around its neck, adding a charming touch. 
The camera slowly zooms in, capturing the texture of the bear's fur and the details of its smile. Soft, warm lighting enhances the cozy and inviting atmosphere of the scene." + }, + { + "prompt_en": "a hair drier", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "hair drier" + } + }, + "refined_prompt": "A sleek, modern hair dryer sits elegantly on a bathroom countertop, with a reflective surface beneath it. The hair dryer has a shiny metallic finish with a sleek ergonomic handle and a long, streamlined nozzle. The power cord is neatly coiled beside it. Soft daylight filters in from a nearby window, creating gentle highlights on the hair dryer's surface, enhancing its sophisticated design. The camera slowly circles around the hair dryer to showcase its design and features from different angles." + }, + { + "prompt_en": "a toothbrush", + "dimension": [ + "object_class" + ], + "auxiliary_info": { + "object_class": { + "object": "toothbrush" + } + }, + "refined_prompt": "A close-up view of a toothbrush is displayed, resting on the edge of a white bathroom sink. The toothbrush has a sleek, modern design with a blue handle featuring textured grips. Its bristles are white with blue tips, neatly arranged and slightly flexing as it lies on its side. Soft lighting illuminates the scene, creating gentle reflections on the sink's surface. The camera captures the toothbrush from a slight angle, enhancing its detail and design intricacies." + }, + { + "prompt_en": "a red bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A bright red bicycle is parked on a quiet street against a backdrop of lush greenery. The bike's shiny frame gleams under the soft, natural daylight. It features a classic design with a comfortable black saddle and curved handlebars. The wheels are sleek, with intricate spokes that catch the light. The camera slowly pans from the front wheel up to the handlebars, highlighting the bicycle's vibrant color and clean lines, before settling on a picturesque view of the adjacent green landscape." + }, + { + "prompt_en": "a green bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A sleek green bicycle is parked on a brick street on a sunny day. It features a modern design with curved handlebars and a smooth frame. The sunlight glints off its metallic finish, highlighting the vibrant green color. The bicycle is positioned with its side facing the camera, showcasing its thin tires and well-maintained chain. The handlebars are slightly tilted, suggesting readiness for a ride. Shadows from nearby trees cast gentle patterns on the ground, adding depth to the atmosphere. The camera remains still, focusing on the bicycle as the focal point of the scene." + }, + { + "prompt_en": "a blue bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A blue bicycle stands alone in a sunlit park, its vibrant color shining under the bright afternoon light. The bicycle features a classic design with a sleek frame. The wheels are well-aligned and glisten slightly as the sun catches them. Nearby, lush green grass and a few colorful wildflowers add a touch of natural beauty to the scene. The camera gently pans around, capturing the bicycle from different angles, highlighting its elegant lines and sturdy build against the peaceful park backdrop." 
+ }, + { + "prompt_en": "a yellow bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A bright yellow bicycle is positioned on a sunlit path, with a backdrop of lush green trees and shrubs. The bicycle has a classic design, featuring a basket on the front and a sleek, retro frame. Sunlight filters through the trees, casting dappled patterns on the path and the bicycle, creating a serene and inviting atmosphere. The camera starts with a close-up of the bicycle, slowly panning upward to reveal its full form set against the tranquil natural scene." + }, + { + "prompt_en": "an orange bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange bicycle is parked on a cobblestone city street under soft, ambient lighting. The bicycle has a sleek frame with contrasting black accents on the seat and handlebars. It stands upright with the help of a kickstand, and its wheels are slightly turned to the side, casting a subtle shadow on the cobblestones. The camera pans slowly from the front wheel to the rear, highlighting the vibrant orange color and the polished metal details of the bike. The atmosphere is calm and peaceful, with a hint of urban charm in the background." + }, + { + "prompt_en": "a purple bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A purple bicycle is prominently displayed against a serene urban backdrop. The bicycle features a sleek design with curved handlebars and a comfortable seat. The vibrant purple frame glistens under the gentle afternoon sunlight, enhancing its color. In the background, soft-focus buildings and trees create a peaceful atmosphere. The camera slowly circles the bicycle, highlighting its intricate details, such as the shiny spokes and pedals. The scene conveys a sense of calm and simplicity in an urban setting." + }, + { + "prompt_en": "a pink bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A pink bicycle is parked in a sunlit park, resting on a lush green lawn. The bicycle features a shiny metallic frame with a soft pink hue, complemented by a matching pink seat and handlebars. A wicker basket is attached to the front, adding a touch of charm. Sunlight filters through the surrounding trees, casting a dappled pattern of light and shadow on the ground. Leaves rustle gently in the breeze, creating a serene and idyllic atmosphere. The camera slowly circles the bicycle to capture different angles, emphasizing its elegant and inviting appearance." + }, + { + "prompt_en": "a black bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A sleek black bicycle is parked against a rustic brick wall. The bicycle features a minimalist design, with a matte black frame and thin tires that complement its streamlined appearance. A small leather saddle adds a touch of elegance, and the handlebars are slightly turned to the side, creating a sense of readiness for motion. Sunlight casts soft shadows on the brick wall and highlights the bicycle's curves, creating a warm and inviting atmosphere. The camera gently pans from the front wheel to the back wheel, capturing the bicycle's full length and emphasizing its stylish form." 
+ }, + { + "prompt_en": "a white bicycle", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A white bicycle is parked on a narrow sidewalk. The bicycle's frame is sleek and shiny, with a minimalist design. Its tires are black with a subtle shine, contrasting with the white frame. The handlebars are curved gently upwards, and the seat is black with a smooth surface. The bicycle is leaning against a small wooden post, with a scenic background of greenery blurred softly in the distance. The camera slowly zooms in, focusing on the bicycle's intricate details and reflecting the serene surroundings." + }, + { + "prompt_en": "a red car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A sleek red car is parked in a vibrant urban setting, surrounded by tall buildings and city lights. The car's glossy paint reflects the surrounding lights, accentuating its curves and design. The camera slowly pans from the front of the vehicle, showcasing its detailed grille and sleek headlights, before moving to the side, highlighting the car's smooth lines and sporty profile. The atmosphere is lively, with the city lights creating a vivid and energetic backdrop." + }, + { + "prompt_en": "a green car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A sleek green car is parked on a quiet street lined with trees. The car's shiny surface reflects the dappled sunlight filtering through the leaves above. The camera slowly circles the vehicle, highlighting its vibrant paint job and smooth, aerodynamic design. The trees in the background add a natural, serene atmosphere, while the gentle rustling of leaves complements the scene's tranquility." + }, + { + "prompt_en": "a blue car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A sleek blue car is parked on a quiet street, surrounded by lush green trees. The sky above is bright and clear, casting a soft light that reflects off the car's glossy surface. The car's design is modern and sporty, with smooth, aerodynamic lines. The front of the car is angled slightly towards the camera, showcasing its shiny grille and stylish headlights. The atmosphere is serene, and the car's vibrant blue color contrasts beautifully with the natural surroundings. The camera gently pans from left to right, capturing the car's elegant form and the tranquil setting." + }, + { + "prompt_en": "a yellow car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A bright yellow car is parked diagonally on a sunlit street, its glossy surface reflecting the mid-morning sun. The car's sleek design, with smooth curves and shining chrome accents, highlights its modern style. A gentle breeze causes nearby trees to cast subtle, shifting shadows on the vehicle. The camera pans slowly from the front of the car to the side, capturing the striking color and the car's elegant lines." + }, + { + "prompt_en": "an orange car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange car is parked in a scenic mountain setting, its glossy paint reflecting the sunlight. The car's sleek design features distinctive curves and sharp lines that give it a sporty look. 
The camera slowly approaches the car from the side, highlighting its elegant silhouette and shiny hubcaps. In the background, majestic mountains rise against a clear blue sky, adding a sense of adventure to the scene. The car sits on a narrow, winding road, suggesting a potential for thrilling drives." + }, + { + "prompt_en": "a purple car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A sleek purple car is parked on a suburban street, its glossy finish reflecting the surroundings. The sunlight catches the curves of the car, enhancing its vibrant color. The car features a modern and aerodynamic design, with tinted windows and sporty rims. The camera slowly moves around the car, capturing different angles and highlighting its elegant lines against the backdrop of a quiet, tree-lined street." + }, + { + "prompt_en": "a pink car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A pink car is parked in a vibrant urban setting, bathed in golden sunlight. The car's sleek and shiny exterior gleams, catching the light and reflecting the surroundings. It's positioned at a slight angle to the camera, showcasing its elegant design. The headlights are stylish and modern, adding to the car's charm. The background features colorful buildings and busy street activity, enhancing the lively atmosphere." + }, + { + "prompt_en": "a black car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A sleek black car is parked on a quiet street, showcasing its polished exterior. The car has a shiny, reflective finish that gleams under the soft ambient light. The camera begins with a wide shot to capture the entire vehicle, and then smoothly zooms in to highlight its aerodynamic curves and stylish wheels. The background features a few trees and a sidewalk, adding a touch of tranquility to the scene. The camera's focus on the vehicle draws attention to its elegance and modern design." + }, + { + "prompt_en": "a white car", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A sleek white car is parked on a smooth asphalt road, with a backdrop of a lush forest creating a serene atmosphere. The car's glossy paint reflects the sunlight, emphasizing its streamlined design and clean lines. The camera slowly circles around the car, showcasing its elegant curves and shiny alloy wheels. The forest provides a peaceful and natural setting, with sunlight filtering through the leaves to create dappled patterns on the ground." + }, + { + "prompt_en": "a red bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A vivid red bird is perched on a thin branch surrounded by lush green leaves. Its feathers are strikingly vibrant, catching the sunlight that filters through the foliage, creating a rich contrast against its surroundings. The bird turns its head slightly, showcasing its sharp beak and alert eyes. The camera gently zooms in, focusing on the bird as it occasionally flutters its wings, making subtle rustling sounds among the leaves. The atmosphere is peaceful, with a soft breeze that lightly sways the branch." 
+ }, + { + "prompt_en": "a green bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A vibrant green bird is perched on a slender tree branch, surrounded by lush foliage. The bird's feathers glisten under the gentle sunlight filtering through the leaves. It tilts its head slightly to the side, showcasing its delicate beak and bright eyes. The bird flutters its wings briefly, causing a soft rustling of leaves around it. The camera gently zooms in, capturing the bird's intricate details and the serene natural setting." + }, + { + "prompt_en": "a blue bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A vibrant blue bird is perched gracefully on a slender tree branch. The bird's feathers glisten in the sunlight, showcasing a striking shade of blue. It has a small, delicate beak and sharp eyes that give it an alert expression. The bird occasionally tilts its head, adding a sense of curiosity to its demeanor. The background is a blur of green foliage, providing a natural contrast that enhances the bird's beautiful color. The camera slowly pans around, capturing different angles of the bird and creating a peaceful and serene atmosphere." + }, + { + "prompt_en": "a yellow bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A vibrant yellow bird is perched on a delicate branch. The bird's feathers glisten in the soft sunlight, showcasing its lively yellow hue. The branch gently sways as the bird shifts its position, revealing its small, attentive eyes and sleek beak. The background is a soft blur of green foliage, adding to the serene and natural atmosphere. The camera slowly zooms in to highlight the intricate details of the bird's feathers and its delicate movements." + }, + { + "prompt_en": "an orange bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange bird perches gracefully on a delicate tree branch, surrounded by vibrant green leaves. The bird's feathers are a vivid orange, shining brilliantly in the soft, dappled sunlight filtering through the foliage above. Its small, round eyes survey the surroundings attentively, while its beak is slightly open, as if ready to chirp. The scene is peaceful and serene, with the gentle motion of leaves swaying in the breeze. The camera slowly zooms in, capturing the bird's intricate feather details and the natural beauty of its environment." + }, + { + "prompt_en": "a purple bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A vibrant purple bird with striking plumage is perched on a delicate branch. The bird's feathers shimmer with various shades of purple under the gentle sunlight. It sits calmly, occasionally turning its head to survey the surroundings. The camera smoothly zooms in on the bird, capturing its graceful stance and perfectly shaped beak. The branch sways slightly in the soft breeze, enhancing the serene and natural ambiance of the scene. The background is a blurred mix of green leaves, emphasizing the bird's vivid color." + }, + { + "prompt_en": "a pink bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A vibrant pink bird is perched on a slender tree branch, set against a backdrop of lush green foliage. 
The bird's feathers gleam in the soft, dappled sunlight filtering through the leaves. It has a smooth, sleek plumage and a delicate beak. The bird occasionally turns its head and flicks its tail gracefully. The camera slowly zooms in, capturing the intricate details of the bird's feathers and the serene ambiance of the surrounding nature." + }, + { + "prompt_en": "a black bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A sleek black bird is perched gracefully on a thin branch, its glossy feathers shimmering in the sunlight. The bird tilts its head slightly, showcasing its sharp, inquisitive eyes and sleek beak. The surrounding leaves gently sway in the breeze, creating a peaceful atmosphere. The camera slowly zooms in, capturing the intricate details of the bird's plumage and the subtle movements of the branch." + }, + { + "prompt_en": "a white bird", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A white bird is gracefully soaring through the sky with its wings fully extended. The bird's feathers glisten in the sunlight as it glides effortlessly, creating a sense of elegance and freedom. The sky is a clear blue with a few scattered clouds, adding depth to the scene. The camera follows the bird's smooth flight, capturing the subtle play of light and shadow across its feathers. As the bird moves from left to right, the camera pans slightly to maintain focus." + }, + { + "prompt_en": "a black cat", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A sleek black cat is sitting gracefully on a wooden windowsill, basking in soft, warm sunlight streaming through the window. The feline's shiny black fur contrasts beautifully with the natural wood of the sill. Its eyes, a striking bright yellow, are half-closed in contentment as it gazes out, possibly spotting a bird or passing leaves. The cat's ears twitch slightly, and its long tail gently sways back and forth. The camera captures a close-up of the cat from the side, enhancing the serene and cozy atmosphere of the scene." + }, + { + "prompt_en": "a white cat", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A charming white cat with soft, fluffy fur is gracefully sitting on a sunny window sill. Sunlight pours through the window, casting a warm, golden glow on its fur, highlighting its elegant, pristine appearance. The cat is gazing outside with mesmerizing blue eyes, its tail gently curling around its paws. There are a few delicate shadows cast by nearby plants, adding a subtle touch of nature to the scene. The camera captures this serene moment with a gentle zoom, enhancing the tranquil atmosphere." + }, + { + "prompt_en": "an orange cat", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange cat with a sleek coat and bright, inquisitive eyes sits gracefully on a windowsill. The cat gazes outside with one paw gently resting on the sill, its tail wrapped neatly around its body. Sunlight filters through the window, highlighting the cat's vibrant fur and creating a warm, cozy atmosphere. The camera captures a close-up view of the cat as it turns its head slightly, showcasing its serene and charming expression." 
+ }, + { + "prompt_en": "a yellow cat", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A fluffy yellow cat sits calmly on a windowsill, gazing outside with its curious amber eyes. The sun shines through the window, casting a warm, soft glow on its fur, enhancing its bright yellow hue. The cat's tail is gently wrapped around its paws, and its ears are perked up, giving it an alert yet relaxed posture. The camera slowly zooms in to highlight the cat's serene expression and the delicate details of its fur as it enjoys its peaceful spot by the window." + }, + { + "prompt_en": "a red umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A vibrant red umbrella is seen spinning gently in the air against a clear blue sky. The umbrella's rich red fabric catches the sunlight, creating bold, striking reflections. Its metal ribs and handle provide contrast, revealing intricate details as it spins. The camera slowly rotates around the umbrella, enhancing the serene and whimsical moment. The cheerful color and simple motion create a joyful and airy atmosphere." + }, + { + "prompt_en": "a green umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A close-up of a vibrant green umbrella is captured as it twirls gently under a light drizzle. The umbrella's fabric has a smooth, satin-like texture, and raindrops create a shimmering effect as they glisten on its surface. As the camera pans around the umbrella, the curved metal spokes are visible, holding its structure firmly. The soft, overcast lighting enhances the umbrella's bright color against the muted background of the rainy atmosphere." + }, + { + "prompt_en": "a blue umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A single blue umbrella is open and positioned on a sandy beach, providing shade from the sun. Its fabric is a vibrant, deep blue, and the metal ribs underneath glimmer slightly in the sunlight. The umbrella casts a small shadow on the sand, and the beach stretches out with gentle waves visible in the distance. The sky is clear with a few wispy clouds, adding to the serene and relaxing atmosphere. The umbrella is centered in the frame as the camera slowly moves closer, capturing its details and the peaceful beach setting." + }, + { + "prompt_en": "a yellow umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A bright yellow umbrella is open and held by an unseen person just off-frame, shielding against a gentle rain. The raindrops lightly tap the umbrella's surface, causing small ripples to spread across the taut fabric. The yellow of the umbrella stands out vibrantly against the soft, muted gray of the rainy sky in the background. The camera slowly zooms in, capturing the texture and color of the umbrella as well as the movement of the raindrops." + }, + { + "prompt_en": "an orange umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange umbrella is open, with its vibrant fabric catching sunlight, creating a vibrant and cheerful ambiance. The camera slowly rotates around the umbrella, highlighting its bright color and the texture of its fabric. 
The umbrella's metal ribs glisten in the light, adding a touch of elegance to the scene. The background features a clear blue sky, providing a striking contrast to the orange hue of the umbrella. The atmosphere is peaceful and summery." + }, + { + "prompt_en": "a purple umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A single purple umbrella is open and gently rotating in the wind on a grassy field. The umbrella has a smooth, elegant fabric with a glossy finish that catches the natural light, creating subtle highlights on its surface. Its handle is sleek and black, contrasting with the vibrant purple canopy. The camera slowly zooms in, focusing on the intricate design of the umbrella's ribs and the way the fabric ripples slightly in the breeze. The background is softly blurred, emphasizing the beauty of the umbrella against the greenery." + }, + { + "prompt_en": "a pink umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A pink umbrella is opened and twirling gently, with its canopy catching the light from above. The umbrella’s panels are a bright, cheerful pink, and it has a sleek, shiny handle. As it turns, subtle shadows and highlights dance across the fabric, adding depth and texture. The camera smoothly zooms in, focusing on the flowing motion of the umbrella's rotation against a soft, blurred background. The atmosphere is lighthearted and whimsical." + }, + { + "prompt_en": "a black umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A black umbrella is open and positioned against a backdrop of a cloudy sky. The fabric of the umbrella is taut, and its sleek, matte finish contrasts with the overcast sky. The umbrella is held aloft, allowing the breeze to gently ripple across its surface. The camera slowly pans upward from the bottom of the umbrella to the top, capturing the umbrella's structure and its elegant, curved handle. The atmosphere is calm and serene, with soft lighting highlighting the umbrella against the muted background." + }, + { + "prompt_en": "a white umbrella", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A pristine white umbrella is open, displaying its elegant structure beneath soft ambient lighting. The umbrella is positioned with its canopy fully spread, showcasing its smooth, unblemished surface. The handle, which is sleek and polished, extends gracefully downward, accentuating the simplicity and sophistication of the design. The background is softly blurred, allowing the umbrella to stand out as the central focus of the scene. The camera gently pans around the umbrella, highlighting its form and the gentle play of light across the fabric." + }, + { + "prompt_en": "a red suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A vibrant red suitcase stands upright on a smooth, glossy surface. Its hard shell glistens under a bright, focused spotlight, enhancing its sleek appearance. The suitcase features a telescopic handle extended halfway and sturdy wheels at the base. The camera slowly zooms in, capturing the gleaming texture and bold color of the suitcase. The background fades into a soft blur, emphasizing the suitcase as the centerpiece of the scene." 
+ }, + { + "prompt_en": "a green suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A sturdy, bright green suitcase stands upright on a smooth, tiled floor. The suitcase has a hard-shell exterior with a glossy finish that catches the light, reflecting its vibrant color. It features a pair of silver zippers running along its edge and a telescopic handle extended at the top. The suitcase's four wheels are visible at the base, angled slightly as if ready to roll. The camera pans slowly around the suitcase, offering a comprehensive view of its sleek design and sturdy construction." + }, + { + "prompt_en": "a blue suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A vibrant blue suitcase stands upright against a clean, white backdrop, highlighting its sleek design. The suitcase features a textured outer shell with a subtle shine, adding to its modern, stylish appearance. It is equipped with smooth-rolling wheels and an adjustable telescopic handle, both made from polished metal. The camera slowly pans from the top of the handle to the bottom, showcasing the suitcase's compact and functional design." + }, + { + "prompt_en": "a yellow suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A vibrant yellow suitcase is positioned upright on a glossy airport floor. The suitcase has a sleek, modern design with smooth, rounded edges and a textured surface that catches the light in the room. The handle of the suitcase is extended, ready for travel. The camera captures the scene from a slightly elevated angle, slowly zooming in to showcase the suitcase’s details and rich color. In the background, there are blurred figures of travelers and the outlines of departure screens, providing a sense of bustling activity in the airport." + }, + { + "prompt_en": "an orange suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange suitcase stands upright on a smooth, reflective surface. The suitcase is sleek and modern, with a hard shell and a glossy finish. It features a sturdy handle and four small, multi-directional wheels at its base. The suitcase's vibrant color catches the light, highlighting its textured design. The camera slowly circles around the suitcase, revealing its sides and capturing the way light plays off its surface, creating a sense of anticipation and travel excitement." + }, + { + "prompt_en": "a purple suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A purple suitcase stands upright on a smooth, glossy surface. The suitcase is medium-sized, with a hard shell and a glossy finish that reflects light. It has a sleek design with a textured pattern on the surface. The suitcase is equipped with a retractable handle and four small wheels at the base, allowing for easy movement. The camera slowly pans around the suitcase, highlighting its vibrant color and modern features. The lighting emphasizes the suitcase's texture and design, creating a stylish and appealing atmosphere." + }, + { + "prompt_en": "a pink suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A pink suitcase stands upright on a flat surface. 
It features a glossy finish and a compact, travel-friendly size. The suitcase is equipped with smooth-rolling wheels at the base and a retractable metal handle that extends upwards. The suitcase has visible stitch details along the edges, giving it a stylish and modern appearance. The camera slowly circles the suitcase, highlighting its sleek design and the soft, gentle shine on its surface as light reflects off of it." + }, + { + "prompt_en": "a black suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A sleek black suitcase stands upright on a polished wooden floor. Its surface is shiny and smooth, with a subtle sheen reflecting soft ambient lighting. The suitcase has sturdy wheels at its base and a telescopic handle extended upwards, ready for use. The camera smoothly pans from top to bottom, highlighting the suitcase's elegant design and durable zippers on the front. The atmosphere is calm, and the suitcase appears ready for travel." + }, + { + "prompt_en": "a white suitcase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A white suitcase is positioned upright on a smooth, light-colored surface. It has a textured, glossy finish that catches the light, creating a sleek and modern appearance. The suitcase features sturdy, metallic handles and smoothly rolling wheels at its base. Its zippers and locks are finely detailed, reflecting the ambient light in the scene. The camera smoothly rotates around the suitcase, offering a comprehensive view of its design and features. The neutral background enhances the suitcase's sophisticated and stylish look." + }, + { + "prompt_en": "a red bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A vibrant red bowl sits on a wooden kitchen counter, its glossy surface reflecting the warm, ambient light. The bowl is empty, with a smooth, rounded shape and a slightly flared rim. The camera captures the bowl from a slight angle above, slowly panning around it to showcase its rich color and flawless finish. The wooden texture of the counter underneath adds a rustic contrast to the bowl’s bold hue, creating a cozy and inviting atmosphere." + }, + { + "prompt_en": "a green bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A vibrant green bowl is positioned on a light wooden table, bathed in soft, natural light. The bowl is smooth and glossy, reflecting the light off its surface. The camera pans slowly around the bowl, capturing its curves and elegant, symmetrical shape. The atmosphere is calm and inviting, highlighting the tranquil simplicity of the single, well-crafted object against the understated background." + }, + { + "prompt_en": "a blue bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A close-up view of a blue bowl resting on a wooden surface. The bowl, smooth and glossy, reflects a soft light, emphasizing its vibrant blue color. The camera captures the bowl from a top-down angle, revealing its empty interior. Soft shadows create a gentle, calm ambiance around the bowl, complementing the natural texture of the wood below. The camera slightly zooms in to accentuate the simplicity and elegance of the bowl." 
+ }, + { + "prompt_en": "a yellow bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A bright yellow bowl sits on a wooden table, its smooth surface reflecting light and highlighting its vibrant color. The bowl is empty, and its round shape and glossy finish stand out against the darker tones of the wood beneath it. The camera circles around the bowl, slowly capturing its symmetry and simple beauty from different angles, while the soft ambient lighting creates subtle shadows around the edges." + }, + { + "prompt_en": "an orange bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange bowl is centered on a smooth wooden table. The bowl is vibrant in color and perfectly round, with a glossy finish that reflects the soft lighting in the room. The camera slowly rotates around the bowl, capturing its rich hue and flawless shape from different angles. The warm lighting enhances the inviting and cheerful atmosphere, and subtle shadows cast by the bowl add depth to the scene." + }, + { + "prompt_en": "a purple bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A purple ceramic bowl is placed on a wooden table. The craftsmanship of the bowl is immaculate, with smooth and glossy surfaces that catch and reflect light. The rich purple hue stands out vividly against the warm tones of the wooden table. The camera slowly circles around the bowl, capturing the beautiful interplay of light and shadow on its surface, and emphasizing its elegant curves and overall shape." + }, + { + "prompt_en": "a pink bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A pink bowl is centered on a wooden surface, illuminated by soft natural light that highlights its smooth texture. The camera slowly circles around the bowl, showcasing its rounded shape and glossy finish. The pink color of the bowl stands out against the muted tones of the wooden surface, creating a pleasing visual contrast. The atmosphere is calm and serene, with the gentle camera movement adding subtle dynamism to the scene." + }, + { + "prompt_en": "a black bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A sleek black bowl is placed on a white marble countertop. Its surface is smooth and glossy, reflecting light in delicate highlights. The bowl is empty, allowing its deep black color to stand out against the contrasting background. The camera slowly circles around the bowl, emphasizing its simple yet elegant design. The lighting in the scene is soft and ambient, creating a serene and minimalist atmosphere." + }, + { + "prompt_en": "a white bowl", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A pristine white ceramic bowl is placed on a glossy wooden table, reflecting the soft ambient light in the room. The bowl has a smooth, round shape with a subtle glossy finish that enhances its elegance. The camera captures the bowl from a slightly elevated angle, showcasing its simplicity and clean lines while focusing on the bowl's gentle curves. The surrounding atmosphere is calm, with warm, natural light illuminating the scene, adding a touch of warmth to the setting." 
+ }, + { + "prompt_en": "a red chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A bright red chair is positioned in the center of a minimalist room. The chair has an elegant, modern design with sleek lines and a smooth, glossy finish. The lighting is soft, casting gentle shadows on the floor around the chair, which emphasizes its vibrant color and stylish form. The camera smoothly circles the chair, capturing its striking appearance from multiple angles, showcasing the chair's inviting and contemporary appeal within the simple setting." + }, + { + "prompt_en": "a green chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A vibrant green chair is positioned in the center of a softly lit room. The chair features a sleek and modern design, with smooth, curved lines and a plush fabric upholstery. Its legs are made of polished wood, adding a touch of elegance. The camera captures a gentle pan, starting from one side of the chair and moving slowly around it to reveal its full structure and the way it stands out against the minimalist background." + }, + { + "prompt_en": "a blue chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A single blue chair sits alone on a polished wooden floor, bathed in gentle, natural light streaming in from a nearby window. The chair, featuring a sleek design with a smooth, curved backrest and sturdy legs, is positioned at an angle that highlights its modern contours. The camera pans slowly around the chair, capturing the soft shadows and subtle texture of the fabric, creating a serene and inviting atmosphere." + }, + { + "prompt_en": "a yellow chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A bright yellow chair is positioned on a wooden floor, illuminated by soft, natural sunlight streaming through a nearby window. The chair has sleek, modern lines with a cushioned seat and backrest, inviting comfort. The camera moves slowly from a side angle towards the front of the chair, highlighting its vibrant color and clean design against the warm tone of the wooden floor. Shadows from the window create an interesting pattern on the floor, adding depth to the scene." + }, + { + "prompt_en": "an orange chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange chair with sleek, modern design elements is placed against a neutral background. The chair features smooth, curved lines and is crafted from glossy plastic, reflecting light softly off its surface. The wooden legs add a touch of natural contrast, offering a stylish blend of materials. The camera slowly circles around the chair, providing a full view of its design and highlighting its vibrant color. The atmosphere is bright and minimalistic, emphasizing the chair as the focal point." + }, + { + "prompt_en": "a purple chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A stylish purple chair is placed against a plain white wall, creating a striking contrast. This modern chair features plush cushioning and sleek armrests, exuding elegance and comfort. The camera slowly pans from left to right, capturing the chair's contours and the subtle sheen of its fabric under soft, diffused lighting. 
Shadows gently play on the floor, adding depth to the serene atmosphere." + }, + { + "prompt_en": "a pink chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A stylish pink chair sits elegantly in the center of a brightly lit room. The chair features a plush, velvety texture with a modern design, complemented by sleek, gold-colored legs. The soft, inviting pink hue adds a touch of charm to the minimalistic setting. The lighting highlights the chair's vibrant color and luxurious fabric, creating a warm and inviting ambiance. The camera slowly pans around the chair, revealing its details and design from different angles." + }, + { + "prompt_en": "a black chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "The scene presents a sleek black chair positioned in the center of a modern, minimalistic room. The chair's smooth leather surface is accentuated by subtle lighting from above, creating gentle highlights on its contours. The four sturdy legs of the chair have a metallic finish, providing an elegant contrast to the deep black of the seat and backrest. The camera slowly zooms in, capturing the chair's polished appearance and the simplicity of its design. The background remains softly blurred, emphasizing the chair as the focal point of the scene." + }, + { + "prompt_en": "a white chair", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A sleek white chair is positioned on a polished wooden floor. The chair has a modern design with a smooth, curved backrest and four slender legs. Natural light softly illuminates the chair from a nearby window, casting gentle shadows on the floor. The camera slowly pans from left to right, showcasing the chair's elegant simplicity and minimalist style. The atmosphere is calm and serene, enhancing the chair's pristine appearance." + }, + { + "prompt_en": "a red clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A vibrant red clock is mounted on a plain white wall. The clock is round, with bold white numbers and sleek black hands that stand out against the red background. The minute and hour hands move smoothly, and the second hand ticks steadily, indicating the passage of time. The minimalistic setting and the intense color of the clock create a striking contrast, drawing attention to its simple yet eye-catching design. The camera smoothly zooms in, emphasizing the clock's vivid hue and precise details." + }, + { + "prompt_en": "a green clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A green clock is mounted on a softly lit wall, casting subtle shadows around its edges. The clock has a round shape with a smooth, vibrant green face, featuring large, white numbers marking the hours. Its sleek, silver hands tick steadily, indicating the current time. The atmosphere is calm and minimalist, with the ticking sound subtly audible in the background. The camera gently zooms in, focusing on the movement of the second hand as it makes its way around the clock face." + }, + { + "prompt_en": "a blue clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A close-up shot of a blue clock is featured, with its round face prominently displayed. 
The clock has a smooth, vibrant blue casing accompanied by a white face and sleek black hands. Bold black numerals mark each hour clearly. The second hand moves in steady, rhythmic ticks, providing a sense of precision and reliability. Soft ambient light highlights the clock's glossy surface, creating subtle reflections and enhancing its attractiveness. The camera remains fixed on the clock, allowing a clear and detailed view of its design and movement." + }, + { + "prompt_en": "a yellow clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A bright yellow clock is mounted on a white wall, creating a vibrant contrast. The clock is round with bold black numerals and matching black hands, clearly indicating the time. The smooth yellow surface of the clock face shines under soft overhead lighting. The camera slowly zooms in, focusing on the steady movement of the second hand, highlighting the passage of time and the clock's detailed features." + }, + { + "prompt_en": "an orange clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An orange clock with a clean, modern design is displayed against a neutral background. It features a bright orange rim and minimalist black hour and minute hands. The clock face is white, with simple black numbers clearly marking the hours. The camera smoothly zooms in, gradually focusing on the center of the clock, highlighting the precise ticking motion of the second hand. The lighting is soft, casting gentle shadows that emphasize the clock's vibrant color and sleek shape." + }, + { + "prompt_en": "a purple clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A close-up view of a purple clock hanging on a wall, its sleek, glossy surface catching the light. The camera slowly zooms in to capture the details of the clock's minimalist design. The clock has bold, white numerals and elegant silver hands that stand out against the purple background. Its second hand moves smoothly in a continuous motion, adding a sense of tranquility to the scene. The lighting creates subtle reflections on the clock's surface, enhancing its vibrant hue." + }, + { + "prompt_en": "a pink clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A pink clock is featured against a simple background, illuminated by soft, even lighting that enhances its vibrant color. The clock has bold, black numerals and sleek, silver hands that contrast strikingly with the pink face. The second hand moves smoothly in a clockwise direction, indicating the passage of time. The camera gently zooms in to highlight the clock's detailed features, including its glossy finish and elegant design." + }, + { + "prompt_en": "a black clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A sleek, black clock hangs on a plain white wall, creating a minimalist and modern contrast. The clock's glossy black frame encircles a simple, elegant face with slim hour and minute hands, which are silver in color. The numerals on the clock are clear and easy to read, adding to its sophisticated appearance. The camera slowly zooms in on the clock, capturing the smooth movement of the second hand as it ticks. The lighting is soft and even, highlighting the clock's polished finish." 
+ }, + { + "prompt_en": "a white clock", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A white clock with clean, minimalist design is mounted on a plain wall. Its circular frame surrounds a white face, with simple black hour and minute hands pointing at the current time. Black numerals clearly mark each hour. The seconds hand ticks steadily, creating a sense of calm and order. Soft ambient lighting casts a gentle shadow behind the clock, enhancing the simplicity and elegance of its design. The camera focuses directly on the clock's face, highlighting the smooth movement of the hands." + }, + { + "prompt_en": "a red vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "red" + } + }, + "refined_prompt": "A vibrant red vase stands on a sleek, modern table. The vase has a smooth, glossy finish that reflects the light, highlighting its elegant curves and simple, classic shape. The camera gently zooms in, emphasizing the richness of the red color and the vase's polished surface. The background is minimalistic, with soft lighting that adds a subtle warmth to the scene, drawing attention to the vase as the focal point." + }, + { + "prompt_en": "a green vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "green" + } + }, + "refined_prompt": "A green vase is placed on a wooden surface, with soft lighting that highlights its smooth, glossy texture. The vase has a sleek and modern design, with a narrow neck that gradually widens toward the base. The light creates subtle reflections on its surface, emphasizing its elegant curves. Behind the vase, a blurred background adds depth to the scene, while maintaining focus on the vase's vibrant green hue. The camera gently pans around the vase, capturing its form from different angles." + }, + { + "prompt_en": "a blue vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "blue" + } + }, + "refined_prompt": "A glossy blue vase is elegantly placed on a polished wooden table. The vase features a sleek, curvaceous design with a smooth surface that reflects light beautifully. Positioned in the center of the table, it stands out with its vivid blue color against a softly blurred background. The camera slowly zooms in, highlighting the vase's elegant curvature and glossy finish, enhancing its stylish and sophisticated appeal." + }, + { + "prompt_en": "a yellow vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "yellow" + } + }, + "refined_prompt": "A vibrant yellow vase is placed on a minimalist wooden table, positioned to be the centerpiece of the scene. The vase has a smooth, glossy surface that reflects the ambient light, creating a warm and inviting atmosphere. It features a gently curved, classic shape that tapers elegantly towards the neck, showcasing its simple yet striking design. A few fresh green leaves or delicate white flowers emerge from the top, adding a touch of nature and contrast to the vivid yellow of the vase. The camera slowly pans around the vase, capturing its graceful contours and the interplay of light on its surface." + }, + { + "prompt_en": "an orange vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "orange" + } + }, + "refined_prompt": "An elegant orange vase stands gracefully on a wooden tabletop. 
The vase features a glossy finish that captures the ambient light, creating reflections that highlight its smooth, curvaceous form. The tabletop is set against a neutral-colored background, which complements the vibrant hue of the vase. The camera slowly zooms in to reveal intricate detailing along the vase's surface, enhancing the warmth and richness of its color." + }, + { + "prompt_en": "a purple vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "purple" + } + }, + "refined_prompt": "A purple vase with an elegant, glossy finish sits on a smooth wooden table. The vase has a narrow neck and widens towards the base, showcasing its graceful curves and polished surface. The light source above creates highlights that accentuate the vase's rich color and texture. The camera slowly moves around the vase, capturing its form from different angles, while the warm tones of the wooden table complement the vibrant hue of the vase." + }, + { + "prompt_en": "a pink vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "pink" + } + }, + "refined_prompt": "A delicate pink vase stands elegantly on a wooden table. The vase has a smooth, glossy surface with a slightly flared top and a rounded base. Soft natural light illuminates the vase, highlighting its subtle curvature and pastel color. A single white flower with green leaves peeks out from the vase, adding a touch of contrast and freshness to the scene. The camera moves gently around the vase, capturing its shape and the play of light on its surface." + }, + { + "prompt_en": "a black vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "black" + } + }, + "refined_prompt": "A sleek, glossy black vase is placed on a wooden table, reflecting light softly across its smooth surface. The vase has a classic, elegant shape that widens at the bottom and tapers gently towards the top. In the background, there is a softly blurred hint of a light-colored wall, giving the scene a neutral and calming atmosphere. The camera slowly circles around the vase, showcasing its shiny finish and graceful contours." + }, + { + "prompt_en": "a white vase", + "dimension": [ + "color" + ], + "auxiliary_info": { + "color": { + "color": "white" + } + }, + "refined_prompt": "A pristine white vase is sitting on a wooden table, its smooth ceramic surface gleaming under soft, ambient lighting. The vase, with its elegant, narrow neck and wider base, exhibits a simple yet graceful design. The camera slowly circles around the vase, highlighting its polished finish and the subtle shadows created by the light. The background is slightly blurred, emphasizing the vase as the focal point of the scene." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "The video captures a beautiful coastal beach in the vibrant colors and swirling patterns reminiscent of Van Gogh's style. The scene showcases dynamic, wave-like brushstrokes depicting the ocean waves gently lapping against the sandy shore. The rich blues of the sea contrast with the golden hues of the sand, creating a lively and textured landscape. The spring atmosphere is enhanced by dashes of vibrant green brush strokes in the vegetation that line the beach. 
The camera slowly pans across the scene, highlighting the intricate patterns and vivid colors, creating a mesmerizing visual experience." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "An oil painting captures a beautiful coastal beach in spring, with gentle waves lapping against the soft sand. The vibrant colors depict a lively and serene atmosphere, as the pastel blue and turquoise waves gently roll onto the shore. The sandy beach is adorned with delicate brush strokes that highlight the natural beauty of the coast. In the background, a clear blue sky with the warm glow of sunlight brings an inviting warmth to the scene. The painting is styled with smooth textures to convey a peaceful and timeless moment by the sea." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "In the delicate style of Ukiyo-e, inspired by Hokusai, a beautiful coastal beach is captured during the vibrant season of spring. The scene depicts gentle waves rhythmically lapping onto the golden sand, creating graceful arcs of white foam. The soft pastel colors of the sky convey a serene atmosphere, with hues of pink and blue reflecting the mild spring weather. Stylized details of the waves and the coastline showcase the elegance and fluidity of traditional Japanese art, highlighting the tranquil beauty of the natural seascape. The composition evokes a sense of harmony and timelessness, inviting viewers to appreciate the subtle complexity of the scene." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "The video presents a stunning black-and-white view of a coastal beach in spring. Gentle waves rhythmically lap onto the sandy shore, creating a serene and timeless atmosphere. The subtle textures of the sand and the soft ripples of the ocean are emphasized by the monochrome palette. The camera glides smoothly parallel to the shoreline, capturing the expansive beauty of the scene while maintaining a tranquil and classic feel." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "A serene coastal beach scene is depicted in pixel art, characterized by its vibrant spring palette. The beach features soft, golden sand with gentle waves lapping against the shore, creating a rhythmic pattern. Small shells and pebbles are scattered along the sand, adding detail to the landscape. The water is a deep blue, transitioning to lighter shades as it meets the beach. The sky is clear, with only a few pixelated clouds, suggesting a pleasant, sunny day. The atmosphere is calm and picturesque, capturing the tranquility of a spring day by the seaside. The camera maintains a steady overhead view, ensuring all elements are perfectly framed within the scene." 
+ }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "The scene depicts a stunning coastal beach captured in a cyberpunk style, with neon lights subtly illuminating the spring ambiance. Waves rhythmically lap onto the sandy shore, each crest reflecting hues of purple and blue from the futuristic skyline in the background. The beach is adorned with glowing digital flora, casting an ethereal light across the sands. The atmosphere is vibrant and surreal, with a mix of natural beauty and technological elements creating an otherworldly, yet captivating, blend. The camera pans slowly along the shoreline, highlighting the interplay of colors and textures." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, a beautiful coastal beach is depicted in a vibrant spring setting. The scene features gently rolling waves that lap rhythmically against the golden sand. The sky is a brilliant blue, dotted with soft, fluffy clouds, adding to the idyllic atmosphere. The sun casts a warm, inviting glow over the entire scene, highlighting the vivid colors. Seagulls can be seen soaring gracefully in the sky, enhancing the lively, animated vibe of the beach." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "The video is a dreamy depiction of a beautiful coastal beach in spring, reminiscent of a watercolor painting. Gentle waves lap at the sunlit sand, creating a rhythmic and soothing sound. The colors are soft and vibrant, with blues of the ocean blending harmoniously with the golden hues of the sand and hints of green from nearby vegetation. The camera gently pans from left to right, capturing the fluid movements of the waves and the serene atmosphere of the beach. The overall scene evokes a sense of tranquility and natural beauty." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "In this surrealism-style scene, a beautiful coastal beach is depicted in the vibrant setting of spring. The shoreline stretches across the frame, with gentle waves lapping rhythmically onto the sand, creating intricate patterns that shift and change. The sky above is painted in an array of dreamlike colors, blending soft pastels that seem to cascade into the horizon. The beach is adorned with abstract forms that resemble sea creatures, and the waves appear to dance in imaginative swirls. The camera slowly pans across the beach, emphasizing the whimsical and surreal nature of the scene, capturing the beauty and tranquility of the coastal setting." 
+ }, + { + "prompt_en": "The bund Shanghai, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "The scene depicts The Bund in Shanghai, reimagined in Van Gogh's iconic style. The atmosphere is rich with swirling, vibrant brushstrokes reminiscent of the renowned painter's work. The historic skyline is adorned with distinctive buildings, their outlines bold and dynamic against the evening sky, filled with swirling blues and hints of orange from city lights. The Huangpu River reflects these colors, adding a shimmering effect. The camera gently pans across The Bund, capturing the captivating interplay of light and texture, infusing the scene with an artistic, dreamlike quality." + }, + { + "prompt_en": "The bund Shanghai, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "The oil painting depicts a vibrant scene at The Bund in Shanghai. The cityscape is dominated by the elegant historical buildings lining the promenade, each glowing under the soft, ambient lighting of streetlights. The sky is a rich tapestry of deep blues and purples, suggesting early evening. Reflections shimmer in the nearby water, capturing the lights from the buildings and a few boats on the river. The painting has an impressionistic style, with bold strokes and a warm palette that imbue the scene with a lively and charming atmosphere. There is a gentle flow to the composition, with the viewer’s eye naturally drawn along the curve of the promenade and out to the distant skyline." + }, + { + "prompt_en": "The bund Shanghai by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "A visually striking scene captures the essence of the Bund in Shanghai, reimagined in the style of Ukiyo-e, reminiscent of Hokusai's iconic works. The composition showcases the historic waterfront's grand architecture lining the Huangpu River, with intricately detailed buildings. In the foreground, clean lines and bold colors depict waves gently lapping against the riverbank. The river glistens under the soft light, and the atmosphere is serene, with a touch of elegance typical of Ukiyo-e art. The sky is depicted with soft gradients, giving the scene a timeless and ethereal quality. The view is framed to evoke the traditional woodblock print style, with attention to both detail and simplicity." + }, + { + "prompt_en": "The bund Shanghai, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "The video showcases a black and white scene of The Bund in Shanghai. The camera pans slowly across the iconic skyline, highlighting the stunning architecture of historical buildings lining the waterfront. The detailed facades and distinct silhouettes of each building stand out against the sky, creating a classic and timeless atmosphere. The Huangpu River can be seen gently flowing in the foreground, adding a sense of motion to the serene scene. The overall atmosphere is elegant and nostalgic, capturing the essence of this famous location in a monochrome palette." 
+ }, + { + "prompt_en": "The bund Shanghai, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "A pixel art depiction of The Bund in Shanghai showcases the iconic skyline with its historic and modern architecture. The scene is filled with vibrant colors representing the varied structures, and a soft, pixelated glow emanates from the buildings. Small, square pixels create a charming, nostalgic feel. The Huangpu River flows in the foreground, with pixelated ripples creating texture across the water's surface. The sky above exhibits a gentle gradient from dusk into night, completing the atmospheric setting. The camera subtly pans across the scene from left to right, giving a comprehensive view of this iconic locale." + }, + { + "prompt_en": "The bund Shanghai, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "The video captures The Bund in Shanghai, transformed into a vibrant cyberpunk scene. Neon lights in hues of blue, pink, and purple illuminate the skyline and reflect off the waters of the Huangpu River. Futuristic holographic advertisements hover over the historic buildings, casting an otherworldly glow. The atmosphere is bustling, with streams of people and vehicles moving through the streets in the distance. The camera pans smoothly from left to right, highlighting the intricate blend of classic architecture and neon-infused modernity, evoking a sense of futuristic allure." + }, + { + "prompt_en": "The bund Shanghai, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated depiction of The Bund in Shanghai, iconic skyscrapers and historic buildings line the promenade along the Huangpu River. The scene features intricate details with warm, vibrant colors, giving it a lively and bustling atmosphere. Animated lights flicker from the buildings, reflecting off the water's surface. People stroll leisurely along the walkway, taking in the stunning skyline. The camera slowly pans from left to right, capturing the harmonious blend of modernity and tradition that characterizes this famous waterfront area. The animated style adds a whimsical and enchanting touch to the scene." + }, + { + "prompt_en": "The bund Shanghai, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "A watercolor painting captures the iconic skyline of The Bund in Shanghai. The painting features the historic waterfront buildings with their classic architectural details, bathed in soft, muted tones typical of watercolor art. The Huangpu River flows gently in the foreground, reflecting the vivid colors of the sky above. The atmosphere is serene and slightly nostalgic, as the brushstrokes create a dreamy blend of colors. The painting conveys a sense of calmness and timelessness, capturing the essence of this bustling cityscape through an artistic lens." 
+ }, + { + "prompt_en": "The bund Shanghai, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "The scene showcases The Bund in Shanghai styled with surrealism. The iconic skyline, adorned with historic colonial buildings, appears dreamlike. The buildings stretch upward in exaggerated shapes, bending and twisting like a fluid painting. The Huangpu River flows with an ethereal quality, reflecting distorted images of the architecture. The sky is a kaleidoscope of vibrant colors, with floating geometric shapes, adding to the surreal atmosphere. The camera pans slowly across The Bund, enhancing the fantastical and other-worldly ambiance of the scene." + }, + { + "prompt_en": "a shark is swimming in the ocean, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "In a Van Gogh style, a shark glides gracefully through the swirling ocean waters. The waves are depicted in dynamic, bold brushstrokes of deep blues and vibrant turquoise, reminiscent of Van Gogh's iconic style. The shark, with its sleek body and prominent dorsal fin, is rendered in shades of gray and white, contrasting beautifully with the surrounding water. The sky above is a whirl of starry night hues, adding depth to the scene, while the light plays across the surface, evoking a mesmerizing, flowing motion. The camera captures this artistic portrayal from a steady angle, allowing the viewer to fully appreciate the unique combination of natural beauty and artistic expression." + }, + { + "prompt_en": "a shark is swimming in the ocean, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "In an oil painting style, a shark is portrayed swimming gracefully through the deep blue ocean. The shark's sleek body is depicted with detailed brushstrokes, emphasizing its powerful movement. The ocean water shimmers with varying shades of blue and hints of green, capturing the play of light beneath the surface. The scene conveys a sense of tranquility and majesty, with the shark moving smoothly from the left to the right side of the canvas. The camera angle is slightly elevated, offering a clear view of the shark as it glides effortlessly through its natural habitat." + }, + { + "prompt_en": "a shark is swimming in the ocean by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "A majestic shark gracefully swims through the ocean, depicted in the Ukiyo-e style reminiscent of Hokusai. The shark has fluid, elegant movements, with its body creating soft waves in the surrounding water. The ocean is illustrated with swirling, stylized waves, characteristic of Hokusai's art, featuring deep blues and whites. The shark's form is detailed, with defined lines and a smooth texture, capturing its powerful yet serene presence. The scene is seen from a slight angle, giving depth and emphasizing the traditional Japanese art style." 
+ }, + { + "prompt_en": "a shark is swimming in the ocean, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "A large, graceful shark is swimming through the ocean, depicted in striking black and white. The shark's strong, streamlined body glides effortlessly through the water, with its tail gently swaying from side to side. The camera captures the shark from a side view, emphasizing the fluidity of its movement and the elegance of its dorsal fin cutting through the water. Light filters down from the surface, casting subtle patterns on the shark's body, enhancing the dramatic monochromatic contrast of the scene. Bubbles rise occasionally, adding to the sense of motion and depth in the underwater environment." + }, + { + "prompt_en": "a shark is swimming in the ocean, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "In a pixel art style, a shark is depicted swimming gracefully in the ocean. The ocean is rendered with shades of blue, creating a pixelated texture of waves and bubbles. The shark, with its distinct pointed dorsal fin and sleek body, moves smoothly from left to right across the frame. The pixel art design gives a charmingly simplistic yet vivid appearance, and the shark's motion appears fluid as it navigates through the stylized waves. The scene captures the serene ambiance of underwater life, with the pixelated water providing a retro aesthetic." + }, + { + "prompt_en": "a shark is swimming in the ocean, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "In an ocean illuminated by vivid neon lights, a sleek shark swims gracefully beneath the surface. The shark's body is highlighted by blue and purple hues, creating a striking contrast against the dark waters. Streams of bioluminescent particles surround the shark, adding to the cyberpunk ambiance. The camera follows the shark as it glides forward, capturing its powerful and fluid movements. The light reflections dance across the shark's sleek skin and the ocean's surface, enhancing the futuristic atmosphere." + }, + { + "prompt_en": "a shark is swimming in the ocean, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, a shark is swimming gracefully through the deep blue ocean. The shark's sleek body glides effortlessly, with its powerful tail propelling it forward. As it moves, its fins occasionally break the surface, sending ripples across the water. The surrounding ocean is filled with soft, animated light beams filtering down from above, creating an enchanting and serene atmosphere. The camera follows the shark from an angled side view, capturing the majestic movement and animated fluidity of its swimming." + }, + { + "prompt_en": "a shark is swimming in the ocean, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "A watercolor painting depicts a dynamic and mesmerizing scene of a shark swimming gracefully in the crystal-clear ocean. 
The artist has used a blend of soft blues and greens to capture the fluidity of the water. The shark, painted in shades of grey and white, glides effortlessly through the water. Its powerful body and sharp fins are richly detailed, showcasing the beauty of this majestic creature. Light filters through the water, casting gentle reflections on the shark's sleek form. The painting has a tranquil and ethereal atmosphere, evoking a sense of calmness in the viewer. The camera subtly zooms in, enhancing the details in the shark’s coloring and the surrounding ocean hues." + }, + { + "prompt_en": "a shark is swimming in the ocean, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "In a mesmerizing surrealism style, a large shark glides smoothly through the vibrant, multi-colored waters of the ocean. The shark's body is painted with swirling patterns of deep blues, purples, and shimmering gold, which reflect the vivid undersea environment. Around the shark, the ocean is filled with fantastical elements, such as crystal formations and abstract coral shapes. The scene is bathed in an ethereal light, casting unusual shadows and highlights. The camera follows closely behind the shark, creating a sense of wonder and fantasy as the surreal landscape unfolds." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "In a cafe in Paris, a panda is seated at a small round table, surrounded by the vibrant, swirling colors characteristic of Van Gogh's style. The panda is depicted with expressive eyes and a gentle demeanor, carefully sipping from a delicate coffee cup with its paw. The cafe's interior is bathed in warm, golden light, with the soft glow casting intricate shadows and highlights on the panda's fur. In the background, the famous Parisian street scene is visible through an arched window, complete with elegantly painted figures and buildings that capture the cafe's lively yet tranquil atmosphere. The camera pans slowly from the window to the panda, accentuating the rich texture and depth of the artistic scene." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "In an enchanting oil painting, a panda is depicted seated at an elegant café in Paris. The setting is beautifully detailed, with the café's outdoor tables and chairs arranged along a scenic Parisian street. The panda, appearing charming and whimsical, is holding a delicate coffee cup in its paw, taking a sip. The café is bustling with a gentle early morning sunlight filtering through the leaves of nearby trees, casting playful shadows across the cobblestone path. The atmosphere blends a sense of warmth and delight, capturing the essence of a leisurely morning in Paris. The use of rich, textured brushstrokes in muted, earthy tones adds depth and vibrancy to this delightful scene." 
+ }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "In a Parisian cafe setting, a stylized panda is seated at a small wooden table, delicately holding a coffee cup. The scene is rendered in the iconic Ukiyo-e style reminiscent of Hokusai, with bold outlines and intricate details. The panda appears charming and serene, with a gentle expression as it sips the coffee. Around the panda, the cafe ambiance is illustrated with classic Parisian decor, including ornate chairs and small potted plants. The background hints at the elegant architecture of Paris, with softly colored walls and large windows allowing light to fill the space, creating a warm and inviting atmosphere. The camera, stationary, captures this delightful and imaginative blend of cultures with precision and artistry." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "In a charming black-and-white scene, a panda is seated comfortably at a small, round table in a Parisian café. The panda is holding a delicate coffee cup and appears to be sipping coffee contentedly. Around the panda, there are classic café elements such as a wrought iron chair, a patterned tablecloth, and a small vase with a single flower. The café's background includes tall windows, capturing the picturesque streets of Paris. The atmosphere is cozy and quaint, with soft shadows highlighting the panda's relaxed posture and expressive eyes. The camera gently pans across the table, focusing on the panda's serene expression." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "In a pixel art style scene, a charming panda sits at a small café table in Paris, enjoying a cup of coffee. The panda, depicted in black and white pixels, has an endearing expression and is gently lifting a steaming cup to its mouth, surrounded by a checkered tablecloth. The setting features classic Parisian elements like an ornate streetlamp and a window with colorful flower boxes, hinting at a bustling Paris street outside. The colors are bright, with a focus on detail that conveys a cozy and whimsical atmosphere. The camera focuses on the panda, providing a charming view of this unique and delightful moment." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "In a cyberpunk-themed café in Paris, a panda is sitting at a sleek, metallic table near a neon-lit window overlooking a futuristic cityscape. The panda, with its distinctive black and white fur, is carefully holding a steaming cup of coffee with both paws, its gaze fixed on the vibrant streets outside. The café interior glows with neon blue and purple lights, casting an otherworldly ambiance. The panda's surroundings include high-tech gadgets and transparent display panels, adding to the cyberpunk atmosphere. 
The camera slowly pans from the panda's face, capturing the warm steam rising from the coffee, to the bustling, neon-bathed streets visible through the window." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, a charming panda is seated at a quaint cafe table in Paris, enjoying a cup of coffee. The scene is set with the iconic Eiffel Tower visible in the background, adding to the Parisian ambiance. The panda, with its soft black-and-white fur and expressive eyes, holds the coffee cup with both paws, savoring each sip. The cafe is adorned with colorful flowers and small, elegant tables. A gentle breeze ruffles the panda's fur, and the surrounding atmosphere is filled with warmth and charm, evoking a delightful and whimsical Parisian vibe. The camera gently pans from the panda to the surrounding cafe scene, highlighting the vibrant and inviting atmosphere." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "In a charming watercolor painting, a panda is depicted sitting at a quaint café in Paris. The panda is adorably sipping coffee from a delicate cup, surrounded by the warm and cozy atmosphere typical of a Parisian café. Soft brushstrokes and gentle hues capture the café's charming details, such as small round tables with intricate designs and classic bistro chairs. The background hints at iconic Parisian architecture, with blurred outlines of nearby buildings and cobblestone streets under a gentle afternoon light. The harmonious blend of colors creates a whimsical and serene atmosphere." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "In a surrealist scene, a panda is seated at a small table in an elegant Parisian café. The panda, appearing lifelike yet out of place, is holding a delicate coffee cup in its paws. The café is adorned with vintage decor, and the large window offers a view of the iconic Eiffel Tower in the distance. The lighting is soft and warm, creating a dreamlike atmosphere as the panda sips the steaming coffee. The camera captures the whimsical scene with a gentle panning motion, highlighting the unique blending of reality and fantasy." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "A charming Corgi joyfully plays in a park during sunset, captured in the vibrant and swirling style of Van Gogh. The short, fluffy dog, with its distinctive large ears and expressive face, leaps and frolics on the grassy field. The Corgi's coat is painted with dynamic, colorful brushstrokes, bringing an artistic flair to its joyful movements. The golden sun sets on the horizon, casting a warm, swirling light across the scene, illuminating the Corgi's playful antics. The camera gently pans to follow the Corgi's movements, adding to the whimsical, artistic atmosphere." 
+ }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "In a stunning oil painting style, a cute and happy Corgi plays joyfully in a vibrant park during sunset. The Corgi's fur is richly detailed, showcasing its fluffy, tri-colored coat with perky ears and an expressive face full of joy. As the dog runs, it kicks up small tufts of grass, adding life to the painting. The park is bathed in the warm, golden hues of the setting sun, casting long shadows and creating a serene atmosphere. Tall trees with dappled leaves surround the scene, completing this picturesque moment. The camera subtly pans to follow the Corgi's lively movements, enhancing the dynamic feel of the scene." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "A charming and joyful Corgi is frolicking playfully in a park at sunset. The scene is visually enriched by the ukiyo-e style reminiscent of Hokusai. The sun is low on the horizon, casting gentle, warm hues across the landscape, while the Corgi's fur captures the light in a captivating manner. The park's scenery includes stylized trees and grass that echo the classic Japanese art form, surrounding the lively Corgi. The camera remains steady, capturing the essence of the Corgi's vibrant playfulness amidst this artistically inspired backdrop." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "In a picturesque park at sunset, a cute and happy Corgi is energetically playing. The scene is captured in black and white, giving it a timeless and classic feel. The Corgi, with its distinctive short legs and fluffy coat, joyfully runs around, its ears perked and tail wagging. The dog occasionally leaps into the air, playfully chasing after leaves carried by a gentle breeze. The silhouette of trees frames the scene as the setting sun casts long shadows across the grass. The camera smoothly follows the Corgi's lively movements to capture the essence of the joyful moment." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "In a vibrant pixel art style, a cute and happy Corgi is playfully frolicking in a park. The setting sun bathes the scene in a warm, golden glow, creating a charming atmosphere. The Corgi, with its distinctive short legs and fluffy tail, is captured mid-bounce, as it joyously runs towards the left side of the screen. The park features pixelated trees and grass, adding to the playful and nostalgic look of the art. The sun, depicted as a large, glowing pixelated orb, sets in the horizon, enhancing the whimsical charm of the scene. The camera subtly pans left, following the Corgi's cheerful movement." 
+ }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "In a vibrant cyberpunk-themed park at sunset, a cute, happy Corgi is energetically playing. The Corgi's fluffy coat gleams with neon accents reflecting the park's futuristic lighting, highlighting its adorable features. The dog is joyfully running through the grass, its tail wagging enthusiastically as it chases a small, glowing ball that matches the neon hues of the surroundings. The setting sun casts a warm, colorful glow, contrasting with the cool, digital landscape of towering structures and holographic displays in the background. The camera follows the Corgi as it bounds towards the camera, capturing its playful spirit and lively movements." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, a cute and happy Corgi is seen playing joyfully in a lush green park during sunset. The Corgi, with its signature short legs and fluffy tail, prances around energetically, occasionally stopping to wag its tail excitedly. The golden rays of the setting sun create a warm and vibrant atmosphere, casting long shadows on the ground. The sky is painted in rich shades of orange and pink hues, adding to the whimsical charm of the scene. The camera gently pans to follow the Corgi's playful movements, capturing the lively and cheerful mood." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "In this watercolor painting, a cute and cheerful Corgi is captured playing in a lush park during sunset. The Corgi, with its fluffy fur and short legs, is energetically running across the grass, its tail wagging in joy. The setting sun casts a warm golden hue across the scene, creating a serene and dreamy atmosphere. The trees in the background are gently swaying, and the sky is painted in soft shades of orange and pink, enhancing the peaceful beauty of the moment. The camera takes a sweeping arc around the Corgi, highlighting its playful nature and the picturesque surroundings." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "In a surrealism-style scene, a cute and happy Corgi with its signature short legs and fluffy fur is energetically playing in a park at sunset. The sun casts an ethereal, golden light across the landscape, with elongated shadows creating a dreamlike atmosphere. The Corgi, with its tongue playfully hanging out and ears perked up, runs joyfully in a zigzag pattern towards the camera, occasionally leaping and frolicking in the lush grass. In the background, the trees and flowers have exaggerated and fantastical shapes, adding to the surreal quality of the scene. The camera follows the Corgi's movements closely to capture its lively and whimsical energy against the enchanting park setting." 
+ }, + { + "prompt_en": "Gwen Stacy reading a book, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "In a vibrant Van Gogh-inspired scene, Gwen Stacy is depicted sitting in a cozy room, immersed in reading a book. Her long, blonde hair flows gracefully around her shoulders as she leans slightly forward, studying the pages intently. She is wearing a charming, vintage-style dress with soft, flowing lines and intricate patterns. The room around her is filled with swirling, vivid colors and bold brushstrokes characteristic of Van Gogh's style. A gentle, warm light bathes the scene, creating an atmosphere of quiet contemplation. The camera gradually zooms in, focusing on Gwen's serene expression and the textured details of the brushstrokes in her surroundings." + }, + { + "prompt_en": "Gwen Stacy reading a book, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "In an oil painting style, Gwen Stacy is seated gracefully in an elegant armchair. She is a young, charming woman with long, flowing blonde hair. Gwen is absorbed in reading a book, which she holds delicately with both hands. She is wearing a stylish, pastel-colored blouse, and her serene expression reflects her engagement with the book. The soft, warm ambient light casts gentle shadows, adding depth and richness to the scene. The background includes a hint of a cozy, inviting room, enhancing the painting's tranquil and thoughtful atmosphere." + }, + { + "prompt_en": "Gwen Stacy reading a book by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "Gwen Stacy, an exceptionally beautiful young woman, is portrayed reading a book by Hokusai in a scene styled after Ukiyo-e art. She is depicted with flowing blonde hair and delicate features, wearing a traditional kimono adorned with intricate patterns. The book, featuring Hokusai's iconic wave design, captivates her attention as she gently holds it in her hands. The background is filled with traditional Japanese motifs and soft pastel colors, creating an atmosphere of tranquility and elegance. The scene is framed in a traditional Ukiyo-e style, emphasizing the serene and timeless beauty of the moment. The camera slowly zooms in to focus on Gwen's serene expression and the detailed artistry of the book's cover." + }, + { + "prompt_en": "Gwen Stacy reading a book, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "Gwen Stacy, portrayed in a classic black-and-white style, is sitting comfortably in a cozy chair with a book open in her hands. She is an attractive young woman with long, flowing hair and a composed expression on her face. Gwen is wearing a stylish blouse and a skirt, adding to her elegant appearance. Her posture is relaxed, deeply absorbed in her book. The soft light from a nearby lamp casts gentle shadows, enhancing the tranquil and timeless atmosphere of the scene. The camera slowly zooms in, highlighting Gwen's focused expression and the pages of the book." 
+ }, + { + "prompt_en": "Gwen Stacy reading a book, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "In a charming pixel art style, Gwen Stacy is seated comfortably, absorbed in reading a book. Her blonde hair, styled in a sleek bob, frames her face as she looks down at the pages. She is dressed in a stylish outfit featuring her signature white hoodie with a pink interior, complemented by a softly lit background that enhances the scene’s cozy atmosphere. Her expression is one of serene concentration, with a hint of a smile as she enjoys the book. The camera captures her from a slightly elevated angle, focusing on her and the open book, creating a warm and inviting scene." + }, + { + "prompt_en": "Gwen Stacy reading a book, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "Gwen Stacy, a charming young woman, is sitting in a dimly lit cyberpunk-themed room, absorbed in a book. Her striking platinum blonde hair is styled in a modern, sleek bob with hints of neon pink highlights. She is wearing a fitted, futuristic jacket adorned with glowing LED patterns, giving her a captivating appearance. Her expression is focused and thoughtful as she turns a page. The room features neon lights casting vibrant hues of blue and purple, reflecting off metallic surfaces and tech gadgets around her. The camera slowly pans from behind Gwen over her shoulder, capturing her engrossed in the book amidst the futuristic decor." + }, + { + "prompt_en": "Gwen Stacy reading a book, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, Gwen Stacy, a charming young woman with strikingly beautiful features and a confident expression, is seated comfortably in a cozy room. She has bright eyes and blond hair styled in a sleek bob, and is wearing a stylish outfit with a light-colored sweater. Gwen is holding an open book in her hands, deeply focused on its pages. She is seated in a relaxed posture on a plush chair as the camera slowly zooms in, emphasizing her serene concentration. The warm tones and soft lighting of the room add a gentle, inviting atmosphere to the scene." + }, + { + "prompt_en": "Gwen Stacy reading a book, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "The scene depicts Gwen Stacy, an alluring and elegant young woman, sitting peacefully as she reads a book. Her blonde hair cascades gracefully over her shoulders, and her eyes are focused intently on the pages. She is wearing a chic, light sweater, and a stylish scarf drapes around her neck, adding a touch of charm. The watercolor painting captures soft, pastel hues, blending harmoniously to create a tranquil and artistic atmosphere. Gwen is seated in a cozy chair near a window where gentle, natural light filters through, illuminating her serene expression and the pages of her book. The camera remains still, allowing the viewer to take in the calming and sophisticated ambiance." 
+ }, + { + "prompt_en": "Gwen Stacy reading a book, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "Gwen Stacy, a charming young woman with striking features and flowing blonde hair, sits elegantly poised while reading a book. She is dressed in a stylish, colorful outfit that complements the surrealism style of the scene. Around her, the environment is dreamlike and otherworldly, with floating, melting clocks resembling those in a Salvador Dalí painting. The background features distorted, abstract shapes and vibrant colors that swirl behind her, creating an atmospheric and intriguing setting. The camera subtly pans around her, capturing the whimsical essence of the surreal setting." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "A boat glides leisurely along the Seine River, its sails gently catching the breeze. The scene is depicted in the vibrant, swirling brushstrokes characteristic of Van Gogh's style, with a rich palette of blues, greens, and yellows. In the background, the iconic Eiffel Tower rises majestically, rendered with dynamic lines that capture its intricate ironwork. The sky is a tapestry of swirling clouds and radiant sunlight, casting a shimmering reflection on the water's surface. The camera slowly pans to capture the entire scene, highlighting the enchanting interplay of colors and the tranquil movement of the boat." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "In an oil painting style, the scene captures a boat sailing leisurely along the Seine River. The boat is elegantly detailed, with smooth, flowing lines that suggest a gentle and relaxed movement across the river. In the background, the iconic Eiffel Tower rises majestically against a softly glowing sky, bathed in warm, golden hues from the setting sun. The river reflects the sky's colors, creating a harmonious blend of light and shadow across its surface. The atmosphere is serene and enchanting, adding a timeless charm to the scene. The composition is framed to highlight both the graceful boat and the towering Eiffel Tower, achieving a perfect balance in the scene." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "In the style of Ukiyo-e, a charming scene unfolds as a boat sails leisurely along the serene Seine River. The boat, with its traditional design, drifts gracefully across the calm waters. Onboard, a couple of elegantly dressed figures enjoy the scenic journey. In the background, the iconic Eiffel Tower rises majestically, rendered with fine, detailed lines typical of Hokusai's work. The sky, capturing a subtle gradient from the setting sun, features delicate hues that paint a tranquil atmosphere. 
The swirling waters and distant Parisian buildings are depicted with flowing, organic patterns, enhancing the print's harmonious balance and classic Ukiyo-e aesthetic. The camera gently pans across the scene, encapsulating the tranquil and artistic essence." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "In a captivating black and white scene, a boat gently sails along the tranquil waters of the Seine River. The majestic Eiffel Tower stands in the background, its iconic silhouette rising elegantly against the sky. The boat glides smoothly from right to left, creating gentle ripples on the water's surface. The camera pans slightly to follow the boat's motion, capturing the serene atmosphere and timeless allure of the Parisian landscape." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "In a charming pixel art scene, a small boat sails leisurely along the tranquil Seine River. The boat is adorned with vibrant colors, and small pixelated waves form around it, indicating gentle movement. In the background, the iconic Eiffel Tower stands majestically, composed of intricate pixel designs. The sky is pixelated with soft hues, suggesting a peaceful atmosphere during either dawn or dusk. The entire scene has a nostalgic, retro feel, enhanced by the pixel art style. The camera maintains a steady view, capturing the boat's serene journey and the Eiffel Tower's grandeur in harmony." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "A sleek, futuristic boat sails leisurely along the Seine River, its neon lights casting vibrant reflections on the water's surface. The boat's design features glowing blue and purple accents that give it a distinct cyberpunk aesthetic. In the background, the Eiffel Tower looms majestically, adorned with pulsating neon lights that illuminate its structure with a captivating, otherworldly glow. The sky above displays a mix of deep blues and purples, adding to the cyberpunk atmosphere, while the cityscape surrounding the tower is dotted with neon signs and skyscrapers. The camera smoothly follows the boat's journey down the river, with the iconic and electrified Eiffel Tower visible in the backdrop." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, a charming boat sails leisurely along the Seine River. The boat, painted in vivid colors, moves gracefully across the water, leaving gentle ripples in its wake. The Eiffel Tower stands majestically in the background, silhouetted against the sky as the sun sets, painting the sky with shades of orange and pink. 
The scene is lively and enchanting, with animated clouds drifting slowly past the iconic landmark. The camera smoothly follows the boat from the side, capturing both the serene river journey and the iconic Parisian backdrop." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "The video opens with a watercolor painting capturing a serene scene along the Seine River. The camera pans across the painting, revealing a charmingly painted boat gently sailing along the river's calm waters. The boat, with its soft blue and white hues, creates a sense of leisurely motion as it glides from left to right across the scene. In the background, the Eiffel Tower stands gracefully, painted in delicate shades of iron grey and brown, adding an iconic touch to the composition. The atmosphere is peaceful, with pastel colors reflecting the tranquil mood of the setting sun." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "In a surrealism-style scene, a boat with a vivid, abstract design sails leisurely along the Seine River. The boat features an unusual shape, with exaggerated curves and vibrant colors swirling across its surface. As it glides smoothly through the water, the Eiffel Tower stands in the background, portrayed with an elongated, distorted form that twists and spirals into the sky. The sky is painted with dreamlike hues of pink and turquoise, complementing the unique, fantastical atmosphere. The camera pans slowly from the boat to the iconic tower, emphasizing the surreal and whimsical elements while capturing the serene movement of the river." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "A couple dressed in elegant formal evening wear is caught in a heavy downpour while making their way home, each holding an umbrella. The man, handsome and confident, is wearing a classic black tuxedo and polished dress shoes. The woman, charming and beautiful, is in a flowing, floor-length gown in a deep, rich hue that complements her features, her hair styled tastefully for the evening. Their expressions show a mix of amusement and surprise as they navigate the wet street. The scene is rendered in a style reminiscent of Van Gogh, with swirling brushstrokes capturing the movement of the rain. The light from street lamps and reflections on the wet pavement create a vibrant, dynamic atmosphere. The camera follows slowly behind the couple, enhancing the immersive experience of the moment." 
+ }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "In an oil painting style scene, a couple is depicted in formal evening wear, caught in a heavy downpour as they try to make their way home. The man is wearing a classic black tuxedo, looking handsome as he holds an umbrella above them. The woman, charming and beautiful, is dressed in an elegant evening gown, holding onto the man's arm. Both umbrellas are struggling against the strong rain, with water droplets painted in fluid, sweeping strokes. The rain creates a textured backdrop, with reflections shimmering on the wet ground and adding a touch of drama to the scene." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "In a style reminiscent of Hokusai's Ukiyo-e, a couple in formal evening wear is depicted walking home during a heavy downpour. The man is elegantly dressed in a tailored dark suit and tie, while the woman wears an exquisite long gown that flows with her movements. Both hold umbrellas, which are adorned with intricate patterns. The rain cascades around them in sheets, creating a sense of movement. The background showcases traditional Japanese architecture and cherry blossom trees, partially obscured by the rain. The overall atmosphere is both dramatic and serene, capturing the beauty of the moment against the stormy backdrop. The camera moves slightly forward, highlighting the intricate details of their attire and the rain's impact on the scene." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "A couple, elegantly dressed in formal evening wear, is caught in a heavy downpour while making their way home. The scene is rendered in black and white, adding a classic and timeless feel. The man, wearing a sharp tuxedo, holds a large umbrella, his other arm wrapped protectively around the woman's shoulder. The woman, dressed in a sophisticated evening gown, clutches her own umbrella, her hair styled in soft waves. Raindrops cascade around them, bouncing off their umbrellas and creating puddles on the ground as they walk briskly towards the camera. The ambient light reflects off the wet pavement, enhancing the dramatic atmosphere. The camera pans slightly to follow their movement, maintaining focus on the rain-soaked surroundings." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "A pixel art scene depicts a couple dressed in formal evening wear walking home in a heavy downpour. The woman, appearing elegant and charming, is wearing a flowing, shimmering gown, and she holds a delicate umbrella above her. 
The man, handsome and confident, is in a sleek suit, clutching a sturdy umbrella. Raindrops are depicted as animated pixels cascading down, while the umbrellas provide limited shelter, emphasizing the intensity of the rainstorm. Both individuals have a slightly hurried posture, with the man leading and the woman close beside him. The background features pixelated city lights reflecting off puddles on the ground, creating a vibrant and lively atmosphere. The camera maintains a steady eye-level view of the couple from slightly behind, capturing their shared moment amidst the storm." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "A couple in formal evening wear is navigating their way home under the glowing, neon-lit skyline of a futuristic city. They are caught in a heavy downpour, with rain cascading around them. The man is tall and handsome, wearing a sleek black tuxedo that complements his sharp features. His hair is slicked back, adding to his sophisticated appearance. The woman is charming and beautiful, dressed in a stunning, form-fitting evening gown that shimmers in the colorful lights. Her hair is elegantly styled, and she holds a transparent umbrella with illuminated edges that glow in cyberpunk hues. The man's umbrella reflects the neon lights, creating a dazzling spectacle as they move together. The rain creates a glossy sheen on the pavement, enhancing the futuristic atmosphere. The camera follows them as they walk away from it, capturing the interplay of light and rain in this vibrant cityscape." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, a sophisticated couple in formal evening wear navigate their way home as a heavy downpour catches them off guard. The elegant woman, wearing a stunning evening gown, holds a colorful umbrella that partially shields her from the rain. The handsome man beside her is dressed in a sharp tuxedo and holds a classic black umbrella. Despite the rain, they both smile and walk briskly along the deserted street, their umbrellas wobbling slightly in the wind. Raindrops splash on the pavement, creating a lively and whimsical atmosphere. The camera follows them closely, capturing the dynamic motion of the rain and their umbrellas as they continue on their path." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "In a beautiful and atmospheric watercolor painting, a couple is depicted walking in a heavy downpour. They are dressed in elegant formal evening wear, with the woman in a stunning, flowing gown that clings gracefully to her form, and the man in a sharp, tailored tuxedo that complements his handsome features. Both are holding umbrellas, which they struggle to keep steady against the wind. 
The rain creates a misty ambiance, with water droplets visibly bouncing off the ground and creating small puddles. The couple walks side by side, moving towards the background of the painting, their expressions a mix of surprise and amusement as they navigate the storm. The soft, blended colors of the watercolor medium enhance the romantic and dreamy feel of their journey home." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "A couple dressed in elegant formal evening wear is seen walking in a heavy downpour, with surrealistic elements enhancing the scene. The man is wearing a sleek black tuxedo, while the woman is in a stunning, flowing evening gown. Both hold open umbrellas, but instead of the usual fabric, the umbrella tops are made of translucent glass that refracts the rain into vibrant, colorful patterns. The raindrops appear oversized and suspended in the air, creating a dreamlike atmosphere. The ground is a reflective surface, mirroring the couple’s elegant movements as they briskly pace towards the camera. The lighting casts an ethereal glow, adding to the surreal quality of the scene." + }, + { + "prompt_en": "An astronaut flying in space, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "An astronaut is gracefully floating in the vastness of space, depicted in the iconic swirling brushstrokes of Van Gogh's style. The astronaut's suit is detailed with swirling blues and whites, capturing the texture and depth of the fabric. The visor of the helmet reflects the swirling stars and celestial patterns that fill the background. The surrounding space is a captivating mix of deep blues, vibrant yellows, and shimmering whites, with ethereal swirls that create a sense of movement and wonder. The camera smoothly circles around the astronaut, emphasizing the dreamlike and surreal quality of this cosmic scene." + }, + { + "prompt_en": "An astronaut flying in space, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "In an oil painting style, an astronaut is depicted floating gracefully in space. The astronaut is wearing a pristine white spacesuit, complete with a large reflective helmet that captures the vastness of the cosmos around them. The rich, deep colors of space, featuring vibrant star clusters and swirling nebulae, form a dramatic and awe-inspiring backdrop. The perspective gives a sense of the astronaut drifting towards the viewer, creating a feeling of depth and motion. The brushstrokes of the oil painting highlight both the texture of the spacesuit and the ethereal beauty of the surrounding space. The scene is lit with a celestial glow, emphasizing the serene and majestic atmosphere. The camera slowly zooms in on the astronaut, enhancing the sense of tranquility and wonder in the vastness of space." 
+ }, + { + "prompt_en": "An astronaut flying in space by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "An astronaut is depicted flying weightlessly in space, rendered in the traditional Ukiyo-e style of Japanese art, reminiscent of Hokusai's work. The astronaut's suit is detailed with intricate patterns and shades, capturing the elegance and simplicity characteristic of Ukiyo-e. Behind the astronaut, a backdrop of swirling cosmic elements and stars is illustrated with the flowing curves and blue tones often seen in Hokusai's prints. The composition conveys a sense of serene exploration, with the astronaut's graceful posture set against the infinite depth of space." + }, + { + "prompt_en": "An astronaut flying in space, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "An astronaut is suspended in the vast expanse of space, the scene rendered in black and white. Dressed in a sleek space suit with a reflective visor, the astronaut floats effortlessly against the backdrop of distant stars and the curvature of a planet visible at the bottom of the frame. The astronaut's posture is relaxed, arms slightly open to the sides, as if embracing the weightlessness. The camera gently pans around, giving a 360-degree view of the astronaut's serene flight and the infinite cosmic setting. The absence of color adds a timeless and classic feel to the scene." + }, + { + "prompt_en": "An astronaut flying in space, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "A pixel art scene depicts an astronaut floating amidst the vastness of space. The astronaut's helmet reflects tiny glimmers of stars, and the suit has subtle shades of white and gray, capturing the classic space attire. The astronaut's arms and legs are slightly bent as if drifting in zero gravity, creating a sense of gentle movement. Around the astronaut, a backdrop of twinkling stars and small colorful planets add to the cosmic feel. The limited color palette typical of pixel art gives the scene a nostalgic, retro atmosphere. The camera subtly zooms in to emphasize the vastness of space around the lone astronaut." + }, + { + "prompt_en": "An astronaut flying in space, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "In a cyberpunk-style scene, an astronaut is floating weightlessly in the vastness of space. The astronaut's sleek, technologically advanced suit features neon blue and purple accents that pulsate softly, reflecting the cyberpunk aesthetic. The helmet visor is tinted, revealing only the silhouette of a handsome face. In the background, the universe stretches out, filled with a vivid array of stars and glowing nebulae. The camera gently pans around the astronaut to capture the surreal beauty of his floating motion and the luminous backdrop." 
+ }, + { + "prompt_en": "An astronaut flying in space, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, an astronaut is floating gracefully in the vastness of space. The astronaut's suit is a crisp white with blue accents, complete with a large, reflective helmet visor. The background is a deep black, dotted with twinkling stars and the faint glow of distant galaxies. As the astronaut moves slowly across the scene, the camera pans to follow, capturing the serene motion against the backdrop of the universe. The gentle, fluid movement creates a sense of tranquility and wonder." + }, + { + "prompt_en": "An astronaut flying in space, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "In a watercolor painting, an astronaut is depicted floating gracefully in the vastness of space. The astronaut's suit is intricately detailed with subtle shades of blue, white, and silver, capturing the reflective quality of the material. The astronaut's visor gleams with a hint of the cosmos, reflecting distant stars. Surrounding them is a stunning array of watercolor hues: deep blues, purples, and blacks representing the infinite expanse of space, with twinkling stars dotting the background. The composition conveys a serene and awe-inspiring atmosphere, emphasizing the beauty and mystery of the universe. The camera slowly pans from left to right, enhancing the feeling of the astronaut's gentle drift through space." + }, + { + "prompt_en": "An astronaut flying in space, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "In a surreal scene set against the backdrop of the vast, star-speckled cosmos, an astronaut is seen gracefully floating in space. The astronaut wears a gleaming silver spacesuit with reflective surfaces that mirror the stunning display of colorful nebulas and distant galaxies all around. The scene is dreamlike, as swirls of vibrant colors—deep blues, radiant purples, and fiery oranges—encircle the figure. The astronaut gently moves with an ethereal fluidity, as if engaged in a weightless dance among the stars. The tranquil and otherworldly atmosphere is enhanced by soft, ambient lighting that gives the entire scene a mystical glow. The camera slowly pans around the astronaut, capturing the surreal environment in its entirety." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "Van Gogh style" + } + }, + "refined_prompt": "The video opens with a majestic scene of snow-blanketed rocky mountain peaks encircling deep, twisting canyons. The mountains rise steeply against the soft, swirling snow, casting long, dramatic shadows into the canyons below. The scene is depicted in a Van Gogh style, with vivid, textured brushstrokes giving life to the snow and rocky formations. The canyons snake and bend through the elevated peaks, creating a mesmerizing pattern that draws the viewer's eye deeper into the mountain range. 
The camera gently glides over the landscape, providing a sweeping view of the intricate play of light and shadow. The atmosphere is serene yet dynamic, capturing the beauty of the natural world in an artistic interpretation." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "oil painting" + } + }, + "refined_prompt": "An oil painting depicts snow-blanketed rocky mountain peaks surrounding and casting shadows over deep, twisting canyons. The mountains rise majestically, their jagged edges creating a dramatic skyline against the clear sky. The snow cover adds a serene and pristine quality, while the canyons below weave through the elevated landscape, leading the viewer's eye on a journey through the rugged terrain. The contrast between the bright snow and the dark shadows in the canyons enhances the painting's depth and natural beauty. The brushstrokes capture the texture of the rocky surfaces and the soft, untouched snow, creating a breathtaking and immersive scene." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "by Hokusai, in the style of Ukiyo" + } + }, + "refined_prompt": "The video opens with a breathtaking view of snow-covered rocky mountain peaks towering majestically. These peaks cast long shadows over the deep, winding canyons below. As the camera gently glides over the landscape, it captures the twisting paths of the canyons that snake through the elevated mountainous terrain. The scene resembles an Ukiyo-e style artwork by Hokusai, with muted colors and intricate details highlighting the serene yet dramatic beauty of the snow-blanketed mountains. The atmosphere is calm and timeless, evoking a sense of awe and tranquility." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "black and white" + } + }, + "refined_prompt": "The scene opens with a sweeping aerial view of snow-blanketed rocky mountain peaks, creating a dramatic black-and-white landscape. The mountains loom large and imposing, casting shadows over the deep canyons below. The camera moves smoothly, capturing the intricate twists and bends of the canyons as they snake through the high-elevated peaks. The interplay of light and shadow highlights the rugged texture of the rocky surface, evoking a sense of icy majesty and solitude." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. 
the canyons twist and bend through the high elevated mountain peaks, pixel art", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "pixel art" + } + }, + "refined_prompt": "The scene depicts a pixel art interpretation of majestic snow-covered rocky mountain peaks encircling deep, shadowed canyons. The snow blankets the rugged surface of the mountains, creating a stark contrast with the dark depths of the twisting canyons below. The mountains rise high into the sky, their sharp edges softened by the snow, while the canyons snake through the elevated terrain, adding a sense of depth and mystery. The atmosphere is serene and wintry, with subtle pixel details capturing the nuances of light and shadow. A gentle camera pan from left to right showcases the expansive landscape." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "in cyberpunk style" + } + }, + "refined_prompt": "The video begins with a sweeping aerial view of snow-blanketed rocky mountain peaks surrounding deep, shadowed canyons. The snow glistens under a vivid neon glow, reflecting the cyberpunk style. The camera smoothly glides above the twisting and bending canyons, revealing the intricate patterns they carve through the high-elevated mountains. Bright neon lights in hues of blue and purple infuse the landscape with a surreal atmosphere, enhancing the futuristic theme. The scene captures the stark contrast between the cool shadows of the canyons and the illuminated peaks." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "animated style" + } + }, + "refined_prompt": "In an animated style, the scene depicts snow-blanketed rocky mountain peaks towering above deep canyons. The snow glistens under the soft light, highlighting the rugged and jagged edges of the mountains. The canyons, with their twisting and bending paths, carve a dramatic landscape through the high elevations. Shadows from the mountains stretch across the scene, creating an atmosphere of mystery and grandeur. The camera smoothly pans across the landscape, capturing the contrast between the bright snow and dark shadows, enhancing the serene yet imposing beauty of the mountains." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "watercolor painting" + } + }, + "refined_prompt": "The camera pans over a watercolor painting depicting snow-covered rocky mountain peaks towering around deep canyons. The snow blankets the rugged mountain terrain, with shadows cast by the towering peaks, adding depth to the scene. The canyons twist and wind through the high elevated landscape, creating a dynamic and intricate pathway. 
The painting's gentle brushstrokes and subtle color variations capture the serene beauty and majesty of this natural wonderland." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style", + "dimension": [ + "appearance_style" + ], + "auxiliary_info": { + "appearance_style": { + "appearance_style": "surrealism style" + } + }, + "refined_prompt": "The scene opens with a sweeping aerial view of snow-covered rocky mountain peaks, towering majestically beneath a clear blue sky. The camera glides smoothly over the breathtaking landscape, capturing the deep canyons that weave through the snowy mountains. Shafts of sunlight cast dramatic shadows across the rugged terrain, enhancing the surreal atmosphere. The canyons twist and turn in an intricate pattern, creating a mesmerizing visual effect. The entire scene is imbued with a sense of grandeur and tranquility." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In super slow motion, a beautiful coastal beach in springtime is showcased as gentle waves lap against the golden sand. The sunlight bathes the scene in a warm, inviting glow, accentuating the vibrant colors of the ocean and the soft sheen on the damp sand. Each wave gracefully approaches and retreats, creating a mesmerizing and tranquil pattern. The motion captures the intricate details of the water's movement, forming delicate ripples and foam as it kisses the shore. The camera pans slowly across the beach, highlighting the serene and idyllic nature of the springtime coastal setting." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a stunning wide shot of a beautiful coastal beach in spring. The serene atmosphere is enhanced by gentle waves lapping at the soft golden sand, reflecting the clear blue sky above. The camera smoothly zooms in, capturing the delicate details of the frothy wave edges as they meet the shore. As the camera closes in on the water, the shimmering turquoise hues of the ocean become more vivid, contrasting beautifully with the sandy beach. The scene is peaceful and bright, evoking a sense of calm and tranquility." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with a close-up view of gentle waves lapping onto the soft, golden sand of a beautiful coastal beach. Blossoming wildflowers dot the shoreline, adding splashes of color to the vibrant spring landscape. The sun is shining brightly, creating a warm and inviting atmosphere with reflections shimmering off the water's surface. As the camera begins to zoom out, a wider view is revealed, showcasing the expansive beach stretching towards the horizon, with green cliffs in the distance. Seagulls can be seen soaring above, completing the idyllic springtime coastal setting." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video captures a stunning coastal beach during springtime. 
The sand is smooth and pristine, while gentle waves lap playfully against the shore, leaving a glistening, wet sheen. The atmosphere is serene and refreshing, with the sunlight casting a soft, golden hue across the landscape. As the camera pans left, it reveals more of the beach's sweeping shoreline, showcasing the interplay of water and sand. The gentle motion of the waves creates an inviting and tranquil ambiance, perfect for a relaxing walk by the sea." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a stunning coastal beach scene during springtime, where gentle waves are lapping rhythmically onto the soft, golden sand. The lighting is bright, with the sunlight casting a warm, inviting glow on the ocean's surface. As the camera slowly pans to the right, the horizon line gracefully shifts, revealing more of the idyllic coastline. Lush greenery borders the sand, creating a vivid contrast with the deep blue of the water. The atmosphere is serene and refreshing, capturing the tranquil essence of a perfect spring day by the sea." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The camera begins with a close-up view of gentle waves lapping against the soft, golden sand of a pristine coastal beach. As the waves recede, they leave delicate patterns in the sand. The sun casts a warm, inviting light over the idyllic scene, enhancing the vibrant colors of the landscape. The camera then tilts up to reveal the expansive stretch of the beach, dotted with greenery that thrives in the spring season. In the distance, the horizon meets a clear blue sky, creating a peaceful and serene atmosphere." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video opens with a wide view of a beautiful coastal beach during springtime. The sky is clear blue, and the sunlight highlights the vibrant colors of the ocean. The camera slowly tilts downwards, revealing gentle waves softly lapping onto the golden sand. The beach is fringed with lush, green vegetation, hinting at the freshness of spring. As the camera tilts further down, the rippling effect of the waves on the sand becomes more prominent, creating a serene and peaceful atmosphere." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene captures a stunning coastal beach in spring, with gentle waves rhythmically lapping onto the soft, golden sand. The water is clear, revealing vibrant blues and greens as it reflects the sunlight. The camera shakes intensely, creating a dynamic and somewhat dramatic effect, as if caught in a moment of excitement or movement, while maintaining focus on the tranquil beauty of the coastal landscape." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video presents a steady and smooth perspective of a beautiful coastal beach in spring. Gentle waves lap against the soft, golden sand, creating a soothing and peaceful atmosphere. 
The sun shines down warmly, highlighting the fresh greenery along the shoreline and the clear, sparkling water. Seagulls can occasionally be seen flying over the ocean in the distance, enhancing the tranquil scene with a sense of lively nature. The camera gently glides along the beach, offering a seamless view of the serene interaction between land and sea." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with a view of a beautiful coastal beach in spring, where gentle waves are lapping against the soft, golden sand. The atmosphere is serene, with the sun casting a warm glow over the landscape. The camera starts with a wide focus on the entire beach, capturing the rhythmic motion of the waves. It then smoothly transitions to a racking focus, drawing attention to the intricate patterns formed by the water as it recedes, leaving shimmering trails on the sand. The contrast between the vibrant blue of the ocean and the sunlight reflecting off the water creates a picturesque springtime beach setting." + }, + { + "prompt_en": "The bund Shanghai, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The camera captures The Bund in Shanghai in super slow motion, with the iconic skyline beautifully illuminated by vibrant city lights. The scene showcases the elegant architecture of the historical buildings on one side, while the modern skyscrapers of Lujiazui, including the Oriental Pearl Tower, rise majestically in the background. The Huangpu River flows serenely between these two contrasting parts of the city. Streetlights and light trails from passing vehicles create a dynamic atmosphere, while pedestrians stroll leisurely along the waterfront promenade. The camera smoothly pans along the waterline, emphasizing the harmonious blend of historical charm and modern sophistication." + }, + { + "prompt_en": "The bund Shanghai, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video starts with a panoramic view of The Bund in Shanghai, capturing the iconic skyline with its majestic blend of historical and modern architecture. The camera zooms in gradually, bringing more focus to the intricate details of the buildings. The shot highlights the elegant Huangpu River in the foreground, reflecting the city lights as evening approaches. The view is bathed in a warm, golden glow from the setting sun, creating a captivating and vibrant atmosphere. As the camera continues to zoom in, it centers on a singular historic building, accentuating its architectural grandeur against the bustling city backdrop." + }, + { + "prompt_en": "The bund Shanghai, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a close-up of the iconic skyline of The Bund in Shanghai, capturing the intricate architectural details of its historic buildings. The lighting casts a warm glow over the structures, highlighting their grandeur. As the camera smoothly zooms out, the panoramic view unfolds to reveal the entire waterfront promenade, lined with illuminated buildings and bustling with activity. The Huangpu River flows alongside, with reflections of the city lights dancing on its surface. The atmosphere feels vibrant and lively, encapsulating the bustling energy of Shanghai." 
+ }, + { + "prompt_en": "The bund Shanghai, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with a panoramic view of The Bund in Shanghai. The camera smoothly pans left, revealing the magnificent architectural skyline along the waterfront. The iconic historical buildings are illuminated by ambient city lights, which cast a warm glow across the streets. Along the promenade, people leisurely walk, silhouetted against the shimmering reflections on the Huangpu River. The sky has a soft twilight hue, enhancing the vibrant, lively atmosphere of the bustling city." + }, + { + "prompt_en": "The bund Shanghai, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video captures the iconic skyline of The Bund in Shanghai during the evening, with the city lights illuminating the area. The scene begins with a view of the historic buildings along the waterfront, beautifully lit against the twilight sky. The camera then smoothly pans right, revealing the bustling activity along the promenade and the shimmering reflections on the Huangpu River. The atmosphere is vibrant, with a mix of modern and historical architecture creating a stunning contrast. The glowing lights of the buildings add a sense of elegance to the scene." + }, + { + "prompt_en": "The bund Shanghai, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene starts with a view of the famous waterfront of The Bund in Shanghai. The foreground showcases the bustling pedestrian walkway with people strolling along, while the iconic historic buildings line the opposite side, illuminated with warm lights. The camera tilts upward, revealing the stunning Shanghai skyline with its modern skyscrapers, including the iconic Oriental Pearl Tower and the futuristic design of the Shanghai Tower. The sky transitions from a soft dusk to early evening hues, with city lights creating a vibrant and lively atmosphere." + }, + { + "prompt_en": "The bund Shanghai, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video starts with a view of the sky over The Bund in Shanghai, the fading light of the sunset casting a warm and vibrant glow. The camera tilts down to gradually reveal the iconic skyline, with its historic architecture and modern skyscrapers. The Huangpu River runs alongside The Bund, reflecting the shimmering lights of the city. As the camera completes its tilt, the bustling street comes into view, filled with people walking and vehicles moving along the road. The bright lights of the city create a lively and energetic atmosphere." + }, + { + "prompt_en": "The bund Shanghai, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a view of The Bund in Shanghai, showcasing its impressive skyline with iconic buildings lining the riverside. The scene features a dynamic, intense shaking effect, adding a dramatic and vibrant quality to the bustling cityscape. The camera moves slightly up and down, enhancing the feeling of movement and energy. The lights from the buildings and the reflection on the river create a vibrant and illuminated atmosphere, capturing the essence of Shanghai's lively urban environment." + }, + { + "prompt_en": "The bund Shanghai, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video opens with a steady and smooth perspective of The Bund in Shanghai, showcasing the iconic waterfront promenade. 
The majestic colonial-era buildings on the left contrast with the modern skyscrapers visible in the background across the river. The scene is lit by the warm glow of streetlights and the colorful illumination of the city skyline. The camera gently pans along the promenade, capturing the vibrant urban atmosphere and the reflection of city lights on the water. The tranquil movement offers a serene view of this historic and bustling area." + }, + { + "prompt_en": "The bund Shanghai, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with a view of The Bund in Shanghai. The foreground is initially blurred, highlighting the vibrant colors and lights of the bustling street. As the focus shifts, the iconic European-style buildings become crisp and clear, revealing their majestic architectural details. The city's skyline can be seen in the background, featuring modern skyscrapers illuminated by colorful lights. The atmosphere is lively, with a mix of historical charm and modern vibrancy. The camera gently adjusts focus, creating a smooth transition that captures the dynamic essence of The Bund." + }, + { + "prompt_en": "a shark is swimming in the ocean, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a mesmerizing super slow motion scene, a large shark glides gracefully through the ocean. The camera captures every detail of its movement as it swims forward, with its powerful tail moving rhythmically. The sunlight filters through the water, casting a shimmering pattern on the shark's sleek body. Tiny air bubbles drift gently around the shark, adding to the serene and fluid atmosphere. The camera follows the shark closely, maintaining a focus on its elegant swim through the underwater world." + }, + { + "prompt_en": "a shark is swimming in the ocean, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a view of a shark effortlessly gliding through the crystal-clear ocean. Its sleek, gray body moves gracefully through the water, with its dorsal fin cutting through the surface. As the camera begins to zoom in, the fine details of the shark's skin and the gentle motion of its tail fin become more pronounced. The sunlight filters through the water, creating dappled patterns on the shark's body and emphasizing the vibrant blue hues of the ocean surrounding it. The camera focuses intently on the shark as it continues swimming, creating an immersive and captivating underwater scene." + }, + { + "prompt_en": "a shark is swimming in the ocean, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video starts with a close-up of a sleek, majestic shark gracefully swimming through the clear, blue ocean water, its powerful tail propelling it forward. The sun's rays penetrate the water, illuminating the beautiful patterns on the shark's skin. As the camera begins to zoom out, the shark becomes a smaller part of the vast underwater landscape, revealing a vibrant reef below with colorful corals and other fish swimming nearby. The atmosphere is serene and captivating, highlighting the beauty and vastness of the ocean environment." + }, + { + "prompt_en": "a shark is swimming in the ocean, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In the deep blue ocean, a sleek shark is swimming gracefully beneath the surface. Its streamlined body moves effortlessly through the water as it heads towards the camera from the right side. 
The camera pans left, capturing the shark's fluid motion and the sunlight filtering through the water above, creating a dappled effect on the ocean floor. The tranquil marine atmosphere is enhanced by the gentle sway of the surrounding sea life." + }, + { + "prompt_en": "a shark is swimming in the ocean, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In the depths of the ocean, a sleek shark gracefully swims through the clear blue water. The shark's smooth, streamlined body glides effortlessly, its fins slicing through the water with precision. The camera begins to pan right, revealing more of the vibrant marine environment. As the view shifts, schools of small fish dart past in shimmering silvery formations, contrasting with the deep blue of the ocean. Dappled sunlight filters through the surface, casting moving patterns of light on the ocean floor, creating a peaceful yet captivating underwater scene." + }, + { + "prompt_en": "a shark is swimming in the ocean, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The camera captures a dynamic scene as a shark gracefully swims through the clear blue ocean, its streamlined body cutting through the water with ease. As the camera executes a smooth tilt upward, starting from the shark's dorsal fin, it gradually reveals the sunlit ocean surface, creating a shimmering pattern of light. The rays from the sun dance on the water, illuminating the marine environment and adding a sense of depth to the scene. Meanwhile, the gentle sway of the water adds a serene yet powerful atmosphere to the underwater world. " + }, + { + "prompt_en": "a shark is swimming in the ocean, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video captures a scene where a sleek shark is swimming gracefully through the clear blue waters of the ocean. The camera tilts down to follow the shark's streamlined body as it moves effortlessly beneath the surface. The light of the sun creates a mesmerizing pattern of ripples and shadows on the ocean floor, enhancing the tranquil yet captivating atmosphere of the underwater world. The gentle swaying of sea plants can be seen in the background, adding to the serene ambiance of the scene." + }, + { + "prompt_en": "a shark is swimming in the ocean, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A powerful shark is swimming in the depths of the ocean, its sleek body gliding effortlessly through the water. The light from above casts a shimmering pattern on its back, creating an intense and dramatic effect. The camera shakes vigorously, enhancing the sense of urgency and energy as the shark moves forward, showcasing its strength and agility. The surrounding water is a deep blue, with occasional bubbles rising to the surface, contributing to the dynamic atmosphere. The camera follows the shark from slightly below, capturing its swift movements and the rippling water around it." + }, + { + "prompt_en": "a shark is swimming in the ocean, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A shark gracefully glides through the deep blue ocean water, its powerful tail providing a steady rhythm as it moves. The camera captures a smooth, steady perspective, following alongside the sleek creature to highlight its streamlined body and distinctive dorsal fin. 
Rays of sunlight penetrate the water, creating a dappled effect on the ocean floor and casting gentle shadows on the shark's skin. The serene underwater atmosphere enhances the shark's majestic movement through its natural habitat." + }, + { + "prompt_en": "a shark is swimming in the ocean, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A shark gracefully swims through the ocean, moving from left to right, with the sunlight filtering through the water casting dappled patterns on its skin. The focus shifts gradually, starting from the background filled with the deep blues of the ocean depths, to the shark in the foreground, highlighting its streamlined body and powerful fins. Suspended particles in the water create a sense of depth and movement, contributing to the serene undersea atmosphere. The camera gently pans to follow the shark's path, enhancing the sense of fluid motion." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a cozy Parisian café, a charming and fluffy panda is captured in super slow motion as it delicately sips coffee from a small cup. The panda's black and white fur contrasts beautifully with the elegant café interior, adorned with ornate wooden details and a softly lit ambiance. As the panda brings the cup to its mouth, its large, expressive eyes gaze curiously at the coffee. The camera slowly pans around the scene, capturing the playful steam rising from the cup, adding a whimsical touch to the delightful moment." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Inside a cozy café in Paris, a charming panda is seated at a small, elegant table. The atmosphere is warm and inviting, with soft lighting casting a gentle glow over the scene. The panda, looking content and relaxed, holds a delicate white coffee cup in its paw. The cup is filled with steaming coffee, and the aroma seems to fill the air. The camera smoothly zooms in to focus on the panda and the coffee cup, capturing the whimsical and delightful moment amidst the sophisticated Parisian café backdrop." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Inside a quaint Parisian café, a panda is seated comfortably at a small, round table, sipping coffee from a delicate white cup. The cozy café is warmly lit with an inviting atmosphere, and the panda appears content and relaxed. Its fur is fluffy and well-groomed, and it holds the cup with one paw. As the video progresses, the camera begins to zoom out, gradually revealing more of the café's charming interior, adorned with vintage decor, wooden furniture, and a view of the bustling Paris streets through a large window. The scene captures the essence of a leisurely moment in a stylish Parisian setting." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a charming Parisian café, a panda sits at a small round table, holding a delicate porcelain cup of coffee in its paws. The café's cozy interior features vintage wooden chairs and tables, with a vase of fresh flowers on each table. The panda's fur is fluffy and well-groomed, contrasting with the steaming dark coffee in the cup. The panda's eyes are gentle and content as it savors the beverage. 
The camera smoothly pans left, revealing more of the café's inviting ambiance, with additional patrons enjoying their drinks and the warm, inviting decor that includes stylish wall art and soft, ambient lighting." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a charming Parisian café, a panda is seated at a small round table, sipping coffee from a delicate cup. The panda appears relaxed and content, with a gentle smile on its face as it looks out at the bustling street through the café's large window. The café is cozily decorated, with small flower arrangements and vintage posters adorning the walls. As the camera pans right, it captures the rest of the café, revealing other patrons enjoying their drinks and pastries, adding to the lively and warm atmosphere. The soft, ambient lighting complements the cozy interior, enhancing the panda's serene moment." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A plush panda toy is sitting on a table in a cozy Parisian cafe, holding a small cup as if sipping coffee. The table is adorned with a delicate lace tablecloth, and a decorative vase with fresh flowers sits nearby. As the camera tilts up, it reveals the cafe's charming interior, featuring vintage decor with elegant chandeliers hanging from the ceiling, and large windows showing a sunny view of Paris streets in the background. The warm lighting inside creates an inviting and classic Parisian atmosphere." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with a tilt-down camera movement revealing a cozy Parisian café. Soft, ambient lighting creates a warm and inviting atmosphere. The camera focuses on a whimsical scene at a small table where a charming panda is seated. The panda, with its distinctive black-and-white fur, is holding a delicate cup of steaming coffee in its paws. It raises the cup to its mouth, taking a sip. The panda sits comfortably on a wooden chair, surrounded by elegant café décor, including a vase with fresh flowers and a croissant on a plate." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a bustling café in Paris, a panda is humorously seated at a small, round table, enjoying a cup of coffee. The panda, with its characteristic black and white fur and expressive eyes, holds the delicate coffee cup delicately between its paws. The café has an elegant Parisian ambiance, complete with ornate decor and soft, warm lighting. An intense shaking effect adds a surreal, whimsical vibe to the scene, making both the panda and the surroundings vibrate slightly. The camera captures a dynamic view, panning around to showcase the charming interior of the café while maintaining focus on the unusual yet captivating sight of the coffee-sipping panda." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Inside a cozy Parisian café, a charming panda is seated at a small wooden table, sipping coffee from a delicate porcelain cup. The panda, with its fluffy black and white fur, is gently holding the cup with its paws while looking content and relaxed. 
The café has an inviting atmosphere, with soft, ambient lighting that highlights the panda's natural beauty. Decorative Parisian elements, such as vintage posters and intricate moldings, adorn the walls. The camera maintains a steady and smooth perspective, capturing the panda's delightful presence and the café's warm and stylish setting." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a charming Parisian café, a panda is seen seated at a small, round table sipping coffee from a delicate cup. The camera begins with a focus on the detailed design of the cup, then racks focus to reveal the panda's endearing features. The café's cozy interior is filled with warm lighting, highlighting the rustic wooden furniture and decorative artwork on the walls. The panda sits comfortably and appears content, surrounded by the gentle ambiance of the bustling café." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A cute and cheerful Corgi is energetically playing in a sprawling park at sunset. The golden hues of the sunset create a warm and enchanting atmosphere, accentuating the playful spirit of the dog. In super slow motion, the Corgi bounds through the grass, its ears flopping adorably with each joyful leap. The camera closely follows the Corgi, capturing every detail of its fluffy coat and delighted expression as it revels in the bliss of the moment. The soft lighting enhances the serene and joyful feeling of the scene." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A cute, happy Corgi with a fluffy coat and short legs is playfully frolicking in a green park during sunset. The warm golden hues of the setting sun bathe the scene in a gentle, inviting glow. The Corgi's tail wags excitedly as it trots across the grass, ears perked up and eyes shining with joy. The camera starts with a wide view of the park and then smoothly zooms in to capture a close-up of the Corgi's cheerful expression and the way its fur catches the light." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A cute and happy Corgi with a fluffy tail and bright eyes is energetically jumping and running around in a spacious park. The setting sun bathes the scene in a warm, golden glow, highlighting the Corgi's playful antics as it frolics on the lush, green grass. The camera captures a close-up of the Corgi's joyful expression before zooming out to reveal more of the park's expansive landscape, dotted with a few trees silhouetted against the sunset sky. The peaceful ambiance of the park complements the Corgi's spirited playfulness." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a picturesque park setting during sunset, a cute and happy Corgi is joyfully playing on a lush green lawn. The warm hues of the setting sun bathe the entire scene, highlighting the Corgi's fluffy fur and playful nature. The little dog bounces around energetically, ears perked up and tail wagging excitedly. As the camera pans left, the surrounding trees and open field slowly come into view, adding to the tranquil and joyful atmosphere. 
The golden light of the sunset creates a serene and heartwarming backdrop for the Corgi's playful antics." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a sunlit park during sunset, a cute and joyful Corgi frolics energetically through the grass. The Corgi's fluffy coat shines with golden hues under the warm evening light. Its ears are perked up and tail wagging excitedly as it playfully hops around. The camera starts with a wide shot of the park, capturing the Corgi's movement. As the camera smoothly pans right, it focuses on the Corgi, following its playful leaps and bounds, with a backdrop of colorful trees and the setting sun." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a sunlit park during sunset, a cute and happy Corgi with its fluffy tail wagging energetically plays joyfully on the green grass. The Corgi's short legs carry it swiftly as it chases a small ball, radiating excitement with its ears perked up and tongue out in a cheerful expression. The warm, golden hues of the setting sun illuminate the scene, casting a soft glow on the playful dog. The camera begins to tilt up, transitioning from the Corgi's playful antics to reveal the expansive sky, painted with vibrant sunset colors above the serene park landscape." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video captures a cute, happy Corgi energetically playing in a park during sunset. The scene starts with a view of the golden sky as the sun sets, creating a warm and inviting atmosphere. The camera then tilts down to reveal the Corgi, with its short legs and fluffy tail, bounding joyfully across the grass. The Corgi's ears perk up and its tongue is out, expressing pure delight as it scampers around. The soft, orange light from the setting sun highlights the Corgi's fur, adding to the charming scene." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a sunlit park during sunset, a cute, happy Corgi is energetically playing on the grass. The Corgi has a shiny coat with characteristic short legs and a joyous expression, batting its tail furiously. As the dog runs around, the playful atmosphere is accentuated by the sun casting a golden hue over the scene. The camera captures the Corgi from a low angle, with an intense shaking effect added to the video, highlighting the dog's excitement and exuberance. The warm colors of the sunset enhance the lively and cheerful mood of the moment." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A cute, happy Corgi is energetically playing in a park during sunset. The dog's short legs move quickly as it frolics through the grass, with its ears perked up and tongue joyfully hanging out. The warm golden hues of the setting sun bathe the park, highlighting the Corgi's fluffy coat and casting soft shadows on the ground. The camera maintains a steady and smooth perspective, capturing the Corgi's delightful antics as it occasionally looks towards the camera with bright, playful eyes. The tranquil and picturesque atmosphere enhances the overall joyous mood." 
+ }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A cute, cheerful Corgi is playfully frolicking in a green park during sunset. The camera begins with a focus on the Corgi, showcasing its fluffy coat and short legs as it energetically runs around, tail wagging excitedly. As the Corgi moves towards and away from the camera, the scene's focus racks, softly blurring the dog momentarily before sharpening on the vibrant orange and pink hues of the sunset sky in the background. The warm light bathes the scene, highlighting the lively atmosphere and the Corgi's joyful expressions." + }, + { + "prompt_en": "Gwen Stacy reading a book, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy, a charming young woman with long, flowing blonde hair and striking features, is seated comfortably in a cozy room, engrossed in a book. She wears a stylish outfit, with a soft-knit sweater that complements her relaxed demeanor. The camera captures her in super slow motion, focusing on her delicate movements as she turns a page. Her expression is one of serene concentration, with her eyes scanning the text thoughtfully. The lighting is warm and soft, creating an inviting and peaceful atmosphere as dust motes gently float in the sunlight. The camera gradually zooms in on Gwen's face, showcasing her gentle smile and the gleam of intrigue in her eyes." + }, + { + "prompt_en": "Gwen Stacy reading a book, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy, a beautiful young woman with striking blonde hair, is sitting comfortably on a plush sofa. She is deeply engrossed in reading a book, her eyes scanning the lines with intent focus. Gwen is wearing an elegant sweater and jeans, and her expression is serene and thoughtful. The camera starts with a wider shot, capturing Gwen in her cozy reading environment, and then smoothly zooms in to focus on her face, showing her subtly expressive eyebrows and gentle smile. Soft lighting enhances the warm, peaceful atmosphere of the scene." + }, + { + "prompt_en": "Gwen Stacy reading a book, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy, a beautiful young woman with striking features and blonde hair styled into a sleek bob, is sitting comfortably, engrossed in a book. She is wearing a stylish purple top and jeans, and she appears relaxed and focused as she reads. Her expression is one of concentration and curiosity as her eyes follow the lines of text. The scene begins with a close-up of Gwen's face and the book, capturing the peaceful moment. The camera slowly zooms out to reveal her surroundings—a cozy room with soft lighting, a plush chair, and a small side table with a steaming cup of tea and a few books stacked beside it. The atmosphere is tranquil and inviting." + }, + { + "prompt_en": "Gwen Stacy reading a book, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy, a charming young woman, is seated comfortably in a cozy room, engrossed in a book. Her striking blonde hair falls gracefully around her shoulders, creating a soft frame for her facial features. She is wearing a stylish sweater and jeans, exuding a casual and relaxed vibe. Gwen's expression is one of focused curiosity as she turns each page. The room is warmly lit, enhancing the serene and inviting atmosphere. 
The camera begins with a close-up of Gwen's face and the book, then pans smoothly to the left, revealing a glimpse of a bookshelf filled with more books and a window letting in soft natural light." + }, + { + "prompt_en": "Gwen Stacy reading a book, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy is sitting comfortably in a cozy room, absorbed in a book she's reading. She is a charming young woman with long, flowing blonde hair and striking features. Gwen is wearing a stylish, casual outfit consisting of a soft sweater and jeans, perfectly complementing her serene and thoughtful expression. The camera begins with a close-up of Gwen's face, capturing her focused gaze as she turns a page. It then pans to the right, revealing more of the inviting room filled with warm ambient lighting, a plush armchair beside her, and a nearby bookshelf lined with various intriguing titles, further enhancing the tranquil atmosphere." + }, + { + "prompt_en": "Gwen Stacy reading a book, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy sits comfortably in a cozy room, absorbed in reading a book. She is a strikingly beautiful young woman with long, flowing blonde hair and bright, expressive eyes. Gwen wears a stylish, casual outfit with a soft sweater and jeans, exuding charm and elegance. She appears relaxed, leaning slightly back in her chair, with a gentle, focused expression as she turns a page. The atmosphere is calm and intimate, with warm lighting illuminating her features. The camera starts with a close-up on the book and then tilts up slowly to capture Gwen’s serene face as she continues to read." + }, + { + "prompt_en": "Gwen Stacy reading a book, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy is sitting in a cozy, softly lit room, engrossed in reading a book. She is a strikingly beautiful young woman with long blonde hair cascading over her shoulders. Her expressive blue eyes are focused on the pages of the book she holds delicately in her hands. Gwen is wearing a stylish, casual outfit—a fitted light-blue sweater and dark jeans. She has a serene and thoughtful expression as she absorbs the contents of the book. The scene presents a warm, inviting atmosphere, with the gentle light creating a peaceful ambiance. As the camera tilts down, it captures the book's cover, revealing its intricate design, and continues to show Gwen's relaxed posture and the cozy surrounding space." + }, + { + "prompt_en": "Gwen Stacy reading a book, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy is seated comfortably on a plush armchair, her attention fully absorbed in the book she is reading. She is a charming, beautiful young woman with long, flowing blonde hair and bright eyes, wearing a stylish casual outfit with a cozy sweater. As she reads, an intense shaking effect surrounds her, adding a sense of urgency and drama to the scene. The camera gently zooms in on Gwen, capturing her focused expression and the book's cover. The soft lighting in the room provides a warm and intimate atmosphere, contrasting with the intensity of the shaking effect." + }, + { + "prompt_en": "Gwen Stacy reading a book, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy, a beautiful young woman with striking features and long blonde hair, is sitting comfortably in a cozy corner, absorbed in reading a book. 
She has an intelligent and engaged expression, her eyes focused on the pages. Gwen is dressed stylishly in a light, elegant sweater and jeans, exuding a sense of charm and sophistication. The camera maintains a steady and smooth perspective, gently panning around her. The soft lighting creates a warm and inviting atmosphere, highlighting the quiet concentration of Gwen as she delves into her book, surrounded by a subtle, serene setting." + }, + { + "prompt_en": "Gwen Stacy reading a book, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Gwen Stacy, an exceptionally charming and beautiful young woman, is seated comfortably in a cozy, well-lit room. Her long, blonde hair falls elegantly over her shoulders as she intently reads a book held gently in her hands. She's dressed in a stylish, casual outfit that complements her graceful appearance. As the camera begins with a focus on her serene expression, it slowly shifts to rack focus to the book's pages, revealing an intricately designed cover. The warm lighting enhances the tranquil and inviting atmosphere of the room, highlighting Gwen's captivating presence." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In super slow motion, a boat glides gracefully along the serene waters of the Seine River. The majestic Eiffel Tower stands proudly in the background, casting its iconic silhouette against a clear blue sky. As the boat moves gently from right to left past the camera, ripples form in the water around its sleek hull, creating captivating patterns. Soft light reflects off the river, enhancing the tranquil and romantic atmosphere of Paris. The camera slowly pans to follow the boat, keeping both the boat and the Eiffel Tower prominently in view." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene begins with a wide view capturing a boat leisurely sailing along the Seine River, with the iconic Eiffel Tower towering in the background. The boat glides smoothly over the shimmering water, under a clear blue sky. The Eiffel Tower stands majestically, framed by the lush greenery of nearby trees. As the video progresses, the camera gradually zooms in, focusing more closely on the boat as it continues its gentle voyage down the river, creating subtle ripples on the water's surface. The atmosphere is serene and picturesque, perfect for an idyllic Parisian moment." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A boat is sailing leisurely along the Seine River, moving gently with the current. In the background, the iconic Eiffel Tower rises gracefully against the clear blue sky, adding a majestic touch to the scene. The boat glides smoothly through the water, with a few passengers enjoying the picturesque view and the calm ambiance. The camera starts with a close view of the boat, highlighting its polished wooden deck and passengers, then gradually zooms out to reveal the broader landscape of the river. The charming Parisian architecture lines the riverbanks, enhancing the elegant visual composition." 
+ }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A boat is sailing leisurely along the Seine River, its pace relaxed and graceful as it glides through the water. The iconic Eiffel Tower stands majestically in the background, silhouetted against the clear sky. The sunlight illuminates the scene, casting gentle reflections on the river's surface. As the camera pans left, the boat and the tower gradually move out of the frame, revealing more of the charming Parisian landscape along the riverbanks, with trees and historic architecture adding to the picturesque view." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with a boat gliding smoothly along the Seine River, its gentle wake rippling the water's surface. The boat is elegantly designed, featuring a sleek hull and a calm, leisurely pace. In the background, the majestic Eiffel Tower rises against the skyline, illuminated by the sun. The sky is clear, casting a picturesque light over Paris. As the camera pans right, it moves smoothly following the boat's path, revealing more of the river and its iconic bridges. The serene atmosphere is enhanced by the tranquil flow of the river and the historical architecture lining its banks." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a view of a boat sailing leisurely along the Seine River. The boat is smoothly gliding through the water, creating gentle ripples as it moves forward. The scene is serene, with sunlight reflecting off the water's surface, adding a shimmering effect. As the camera tilts up, the iconic Eiffel Tower comes into view, standing majestically against a clear blue sky. The combination of the flowing river and the Eiffel Tower in the background creates a picturesque and tranquil atmosphere." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a view of the iconic Eiffel Tower standing tall against a clear blue sky. As the camera tilts downward, it reveals a boat gliding leisurely along the calm waters of the Seine River. The boat, painted in white, reflects the sun's gleam, creating a serene and picturesque scene. The surrounding greenery and riverbanks add to the tranquil atmosphere. The boat, sailing smoothly from left to right, creates gentle ripples on the river's surface, enhancing the composition." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A boat is sailing leisurely along the Seine River, with the majestic Eiffel Tower standing tall in the background. The sky is clear, with a warm glow illuminating the iconic landmark. The boat is gracefully moving from left to right, leaving a gentle wake behind it as it glides on the water. A subtle, intense shaking effect mimics the sensation of watching from a distance with the camera slightly unsteady, adding energy and motion to the scene. 
The camera follows the boat, keeping the Eiffel Tower centered in the frame." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A boat is peacefully sailing along the Seine River, with the iconic Eiffel Tower rising majestically in the background. The camera captures a steady and smooth perspective, moving parallel to the boat as it glides gently through the calm waters. The sunlight reflects off the surface of the river, creating sparkling highlights. The vessel, with its white exterior, contrasts beautifully against the lush green trees lining the riverbanks. The Eiffel Tower stands tall, adding a touch of grandeur to the serene scene." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a serene scene, a boat sails leisurely along the Seine River, its reflection shimmering on the water's surface. The camera initially focuses on the boat, capturing details like the passengers on deck and the gentle ripples as it moves smoothly across the water. Slowly, the focus shifts to the iconic Eiffel Tower standing majestically in the background. The tower, bathed in the soft glow of the setting sun, dominates the Parisian skyline. The atmosphere is calm and picturesque, and the camera slowly pulls back for a wider view, blending the boat and the tower seamlessly into the tranquil scene." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In super slow motion, a couple elegantly dressed in formal evening wear is seen walking along a dimly lit city street. The man is wearing a sleek black tuxedo, and the woman is adorned in a graceful, shimmering black evening gown. The street is illuminated by the soft glow of streetlights. As they walk together, each holding a large umbrella, they are caught in a heavy downpour. The rain cascades around them in mesmerizing patterns, splashing onto the street and creating a reflective surface. The woman's gown subtly sparkles, while the man's attire remains dapper. The couple shares a warm, intimate glance, their expressions filled with surprise and delight as droplets fall visibly and gracefully in the slow-motion scene. The camera gently pans to capture the enchanting atmosphere and their synchronized movements amidst the downpour. " + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A well-dressed couple is walking together through a city street under a heavy downpour. The man, looking dapper in a sharply tailored black tuxedo, holds the umbrella firmly as he shields both of them from the rain. The woman, elegantly dressed in a long, flowing gown, clutches the man's arm with one hand while holding a small evening purse in the other. Both display a sense of charm and poise despite the weather, with raindrops bouncing off their umbrellas. The camera zooms in to capture their expressions, showing the woman's radiant smile contrasting with the man's focused, yet relaxed demeanor. The romantic atmosphere is enhanced by the shimmering reflections of city lights on the wet pavement." 
+ }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A couple elegantly dressed in formal evening wear is caught in a heavy downpour on their way home. The woman, looking stunning with an intricate updo and in a sleek, long gown, clutches a black umbrella as the rain pours down. Beside her, the man, handsome in a tailored tuxedo, holds his umbrella firmly, trying to shield himself and his partner from the rainfall. Their expressions reflect a mix of surprise and amusement. The scene is set against a dimly lit street with glistening reflections on the wet pavement. As the camera gradually zooms out, the rain intensifies, and the couple becomes part of the broader cityscape, with streetlights creating a softly glowing backdrop." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A couple is caught in a heavy downpour as they walk along a city sidewalk, trying to shield themselves with umbrellas. The woman, elegant and charming, is wearing a stunning long gown that shimmers, while the man looks handsome in a classic tuxedo. Raindrops bounce off their umbrellas, and their outfits glisten from the rain. They cautiously navigate puddles on the sidewalk. The camera pans left, following their progress as they make their way through the rain-soaked street, capturing the dynamic and romantic atmosphere of the evening." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In a dramatic scene, a couple dressed in elegant formal evening wear is caught in a sudden, heavy downpour. The man is wearing a tailored black tuxedo, while the woman is dressed in a glamorous, flowing evening gown. Both are holding umbrellas to shield themselves from the rain as they make their way home. The camera pans right, revealing the couple's hurried steps on the rain-soaked sidewalk. The raindrops patter audibly against their umbrellas, and the streetlights cast a soft glow, illuminating the shimmering raindrops around them. Despite the unexpected weather, they share an amused glance, adding a sense of charm and spontaneity to the moment." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A couple dressed in elegant formal evening wear is making their way along a dimly lit street, illuminated by streetlights shimmering in the heavy downpour. The woman, with graceful poise, holds a sleek black umbrella, wearing a stunning, form-fitting gown that glistens with raindrops. The man, exuding charm, is in a tailored tuxedo, holding a classic umbrella with one hand while the other is held romantically around the woman's waist as they walk towards the camera. Raindrops cascade around them, creating a romantic and cinematic ambiance. The camera tilts up, capturing the couple from their feet to their smiling faces, emphasizing their happiness amid the rain." 
+ }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A couple, elegantly dressed in formal evening wear, is caught in a sudden heavy downpour as they walk home. The woman, stunningly beautiful, is in a flowing evening gown that shimmers even in the rain, while the handsome man is dressed in a sharp tuxedo. Both are holding umbrellas, the woman's is a delicate lace, and the man's is a classic black with a wooden handle. The rain pours down heavily, creating a dramatic atmosphere as they navigate through the wet streets. The camera starts at a high angle, showing the couple beneath their umbrellas, then tilts down to reveal rainwater splashing around their feet and puddles forming on the ground as they continue their journey home." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A couple, elegantly dressed in formal evening wear, is seen walking along a dimly lit street during a heavy downpour. The man, handsome and distinguished, wears a classic black tuxedo with a neatly styled haircut. The woman, charming and beautiful, is adorned with a stunning floor-length gown that shimmers slightly as they move. Both hold umbrellas, trying to shield themselves from the intense rain. \n\nAs they make their way home, the downpour creates a powerful atmosphere, with raindrops bouncing off the pavement and splashing around them. The camera follows the couple, moving in sync with their steps but incorporates an intense shaking effect as if mimicking the blustery wind and storm conditions, adding a sense of urgency and drama to the scene. The streetlights cast a soft, flickering glow on the wet ground, enhancing the moody and dynamic ambiance." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A well-dressed couple, consisting of a handsome man in a sharp black tuxedo and a stunning woman in an elegant flowing evening gown, are caught in a heavy downpour on their way home. Each holds a large umbrella, the rain creating a rhythmic sound as it hits the umbrella surfaces. They move gracefully through the wet street, the man's hand gently guiding the woman as they walk side by side. The streetlights create a soft glow, reflecting off the wet pavement and illuminating the rain-soaked scene as they move towards the camera. The camera captures the couple with steady, smooth motion, enhancing the elegance and charm of the moment." + }, + { + "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "A sophisticated couple in formal evening wear is walking towards the camera, caught in a heavy downpour. The man, handsome and poised, is dressed in a tailored dark suit, while the woman, charming and elegant, is wearing an exquisite evening gown that drapes beautifully despite the rain. Both hold umbrellas, the man's in a classic black, and the woman's a more fashionable pattern. Their expressions show a mix of surprise and amusement as they navigate through the rain. 
The camera initially focuses on the couple as they move closer, the rain drops creating a textured backdrop. It then racks focus to emphasize the rain pouring around them, highlighting the intensity of the downpour before shifting back to their faces." + }, + { + "prompt_en": "An astronaut flying in space, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "An astronaut floats gracefully in the vastness of outer space, captured in super slow motion. The astronaut, wearing a pristine white space suit adorned with reflective visors and intricate details, glides effortlessly with arms slightly bent at the elbows and legs gently extended, creating a serene and weightless effect. The deep, star-filled backdrop adds to the sense of infinite depth and wonder. The camera subtly pans to the side, enhancing the perception of the astronaut's delicate, slow-motion movement through the silent, awe-inspiring expanse." + }, + { + "prompt_en": "An astronaut flying in space, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "An astronaut is floating gracefully in the vastness of space, surrounded by a backdrop of twinkling stars. The astronaut's suit is sleek and white, reflecting the distant glow of the stars, and features intricate details in its design. The helmet's visor gleams, partially obscuring the astronaut's face while providing a view of the galaxy. The camera slowly zooms in, capturing a closer look at the intricate details of the suit and the face shield's reflection of the surrounding space. The serene and timeless atmosphere emphasizes the awe-inspiring beauty of the universe." + }, + { + "prompt_en": "An astronaut flying in space, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with a close-up of an astronaut floating in the vastness of space. The astronaut, wearing a white space suit adorned with emblems, has a reflective visor that mirrors the distant, twinkling stars. As the camera gradually zooms out, the curvature of Earth comes into view below, glowing with a soft blue hue. The astronaut moves with slow, gentle motions, arms outstretched as if reaching towards the infinite expanse. The camera continues to pull back, revealing the peace and majesty of the universe, accentuated by the astronaut's serene expression and graceful posture in zero gravity." + }, + { + "prompt_en": "An astronaut flying in space, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "An astronaut is floating gracefully through the vast expanse of space, with a backdrop of twinkling stars. The astronaut's sleek suit shines with a reflective hue, and a distinctive helmet with a clear visor reveals the determined and focused expression on his face. His movements are smooth and controlled as he drifts effortlessly, arms slightly bent at the elbows. As the camera pans left, the view captures more of the infinite galaxy, with distant planets and colorful nebulae adding depth and wonder to the scene. The deep blackness of space highlights the serene yet awe-inspiring atmosphere." + }, + { + "prompt_en": "An astronaut flying in space, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "An astronaut is floating gracefully in the vastness of space, surrounded by a star-filled, deep black void. The astronaut, in a sleek white spacesuit adorned with patches and gear, reflects the distant light from the stars. 
The person has a large, mirrored visor that obscures their face, adding to the mystery of the scene. As they drift, their body is slightly tilted with one arm outstretched, giving the impression of gentle motion. The camera smoothly pans right, revealing more of the surrounding celestial expanse, including a faint glow from a distant galaxy in the background. The scene is serene and awe-inspiring, emphasizing the beauty and isolation of space." + }, + { + "prompt_en": "An astronaut flying in space, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "An astronaut is floating weightlessly in the vast expanse of space, with a stunning backdrop of twinkling stars and the distant curve of a planet's horizon. The astronaut's suit is brilliantly white, contrasting against the deep blackness of space, and the reflective helmet visor catches glints of light from the nearby sun. As the camera tilts upwards, the view shifts to reveal the immensity of space above, emphasizing the astronaut's solitude and the infinite cosmos. This upward tilt imbues the scene with a sense of wonder and exploration." + }, + { + "prompt_en": "An astronaut flying in space, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "Amidst the vastness of space, an astronaut drifts gracefully. The blackness of space is studded with distant stars, creating a serene yet awe-inspiring backdrop. The astronaut, a handsome man with bright eyes and a confident demeanor, is clad in a sleek, white spacesuit featuring various patches and a reflective visor. As he floats, his movements are slow and controlled, arms gently moving as if swimming through the void. The camera tilts down, revealing the curvature of a vibrant blue planet beneath him, its beauty adding depth and wonder to the cosmic scene." + }, + { + "prompt_en": "An astronaut flying in space, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "An astronaut is floating in the vastness of space, surrounded by the infinite darkness dotted with distant stars. The astronaut, wearing a pristine white space suit with a reflective visor, experiences an intense shaking motion. This shaking effect gives a sense of turbulence, causing the astronaut to rock slightly back and forth, with the reflection of stars shimmering across the visor. The camera captures the scene with slow zoom-in movement to emphasize the powerful sensation of being in the unending void, enhancing the dramatic atmosphere of this cosmic environment." + }, + { + "prompt_en": "An astronaut flying in space, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "In the vastness of space, an astronaut is gracefully floating, surrounded by a star-studded black sky. The astronaut, wearing a sleek and polished white spacesuit, complete with a reflective visor and visible patches, moves in a slow, controlled manner. The camera maintains a steady and smooth perspective, providing a clear view of the astronaut's serene and weightless motion. In the distance, the curvature of a planet adds depth to the scene, illuminated by a soft glow of sunlight casting gentle shadows on the astronaut's suit." + }, + { + "prompt_en": "An astronaut flying in space, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "An astronaut is floating weightlessly in the vastness of space, with the dark expanse of the cosmos dotted with shimmering stars all around. 
The astronaut, wearing a pristine white spacesuit with a reflective visor, appears to be gracefully shifting position, reaching forward as if adjusting something on a nearby module. The camera begins focused on the astronaut, capturing the intricate details of the suit and the subtle movements. Then, the focus racks to a distant celestial object, perhaps a planet or a cluster of stars, creating a breathtaking backdrop for the scene. The slow camera movement adds a sense of awe and wonder to the cosmic setting." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with an expansive view of snow-blanketed rocky mountains standing majestically under a clear blue sky. The sun casts long shadows across the landscape, emphasizing the rugged texture of the peaks. The camera glides smoothly over the snow-covered mountains, capturing the dramatic depth and curves of the canyons below. These canyons twist and bend gracefully through the elevated peaks, creating a sense of awe and grandeur. The super slow motion effect highlights wind-blown snow particles and the serene beauty of this winter wonderland. The camera pans and tilts gently to follow the contours of the canyons, enhancing the sense of immersion." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The camera begins with an expansive aerial view of snow-blanketed rocky mountain peaks surrounding a series of deep canyons. The peaks, dusted in sparkling white snow, tower majestically above, casting long shadows over the landscape. The canyons below twist and bend, creating intricate patterns as they snake through the elevated terrain. As the camera zooms in, the rugged texture of the mountains and the depth of the canyons become more pronounced, highlighting the dramatic beauty of the scene. A crisp, clear atmosphere enhances the breathtaking view." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a close-up view of snow-blanketed, jagged rocky mountain peaks towering high in the sky. The snow glistens in the soft sunlight, creating a serene and majestic atmosphere. Below, deep canyons twist and bend through the terrain, casting long, dark shadows that contrast with the bright snow. As the camera zooms out, a breathtaking panoramic view of the entire mountainous landscape unfolds, showcasing the vastness of the peaks and the intricate network of the canyons beneath. The atmosphere is peaceful yet awe-inspiring, with a sense of untouched natural beauty." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video opens with a breathtaking panoramic view of snow-blanketed rocky mountain peaks, which rise majestically against a clear blue sky. 
The bright sunlight casts long, dramatic shadows over the deep canyons below. These canyons, carved through the high elevation of the mountain range, twist and bend with striking natural beauty. The camera smoothly pans to the left, revealing even more of the rugged, snow-covered landscape, emphasizing the vastness and majesty of this breathtaking natural scenery. The cool, crisp atmosphere is imbued with a sense of tranquility and awe." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene begins with a breathtaking view of snow-blanketed rocky mountain peaks towering majestically over deep, shadowy canyons. The snow glistens under the bright sunlight, creating a striking contrast with the rugged, dark rocks. The canyons twist and bend intricately through the lofty, elevated mountains, presenting an awe-inspiring landscape of nature's raw beauty. The camera smoothly pans to the right, revealing more of these dramatic formations, enhancing the grandeur and scale of the mountainous terrain." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with a view of snow-blanketed rocky mountain peaks, creating a majestic and rugged landscape. The deep canyons cut through the mountains, casting long shadows that accentuate their depth and twisting paths. As the camera tilts up, it reveals the intricate network of canyons winding and bending through the high elevated peaks. The snow adds a serene and pristine atmosphere, while the clear blue sky overhead enhances the sense of vastness." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video begins with a breathtaking aerial view of snow-blanketed rocky mountain peaks towering over deep canyons. The snow-covered peaks glisten under the soft light, creating a serene and majestic atmosphere. As the camera tilts down, it reveals the canyons below, which twist and bend dramatically through the high-elevated terrain. The shadows cast by the surrounding mountains enhance the depth and contours of the canyons, highlighting their intricate shapes and textures as the camera smoothly pans downwards." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene opens with an expansive view of snow-covered, rocky mountain peaks, their rugged surfaces glistening under bright daylight. As the camera pans, it reveals deep canyons snaking their way through the high-elevated mountains. The canyons twist and bend dramatically, emphasizing the grandeur of the landscape. Shadows are cast deep into the canyon crevices, adding depth and mystery. The camera then moves smoothly above the scene, capturing the intricate paths of the canyons. 
An intense shaking effect is applied, enhancing the majestic and powerful presence of the snowy peaks and deep canyons." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The video offers a breathtaking view of snow-blanketed rocky mountain peaks towering over and casting shadows across deep canyons. The camera smoothly glides through the scene, capturing the winding pathways of the canyons as they twist and bend amidst the majestic mountain range. The cool, crisp atmosphere is evident as the sun illuminates the snow, creating a stunning contrast with the rugged gray of the rocks. The elevated perspective enhances the grandeur of the mountains and the intricate formations of the canyons below." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus", + "dimension": [ + "temporal_style" + ], + "refined_prompt": "The scene showcases towering rocky mountain peaks, blanketed in snow, casting shadows over the deep, winding canyons below. The camera begins with a wide shot, highlighting the grandeur of the snow-covered mountains and the intricate twists and turns of the canyons as they carve through the elevated landscape. As the focus shifts, the details of the rugged terrain become more pronounced, capturing the sharp contrasts between the brilliant white snow and the dark, rocky surfaces. The atmosphere is majestic and tranquil, with the sunlight accentuating the natural contours of the mountains and canyons." + }, + { + "prompt_en": "Close up of grapes on a rotating table.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A close-up view captures a cluster of fresh, vibrant grapes resting on a rotating table. The light glistens off the smooth, plump grapes, highlighting their rich purples and greens. As the table gently rotates, the camera follows the movement, allowing different angles of the grapes to be illuminated. Shadows subtly dance across them, creating depth and texture in the scene. The atmosphere is serene and elegant, emphasizing the natural beauty and freshness of the fruit." + }, + { + "prompt_en": "Turtle swimming in ocean.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A sea turtle gracefully swims through the clear turquoise waters of the ocean. Its flippers move in a steady, rhythmic motion as it glides forward. The sun's rays penetrate the water, creating a dappled pattern of light on the turtle's intricately patterned shell. The scene is tranquil, with the gentle currents softly swaying nearby sea plants. Bubbles occasionally rise around the turtle as it swims, adding to the serene underwater atmosphere. The camera smoothly follows the turtle from a side angle, capturing its elegant movements." + }, + { + "prompt_en": "A storm trooper vacuuming the beach.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A stormtrooper, clad in the iconic white armor, is vacuuming the sandy beach. 
The helmet obscures the face, adding to the robotic appearance, while the armor contrasts sharply against the soft, golden sand. The vacuum cleaner is a sleek, futuristic model, with a long hose extending to the stormtrooper's grip. The sky is overcast, creating an intriguing atmosphere as the stormtrooper moves methodically along the shoreline, leaving smooth paths in the sand. The camera captures this unique scene from a low angle, emphasizing the surreal nature of the moment." + }, + { + "prompt_en": "A panda standing on a surfboard in the ocean in sunset.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A panda stands confidently on a surfboard, riding the gentle waves of the ocean as the sun sets in the background. The golden hues of the sunset reflect off the water, creating a warm and serene atmosphere. The panda's fur appears slightly tousled by the breeze, and its black and white coat contrasts beautifully with the vibrant, colorful sky. The surfboard bobs gracefully with the motion of the waves, and the camera captures the scene from a low angle, moving slightly to emphasize the panda's playful stance on the surfboard." + }, + { + "prompt_en": "An astronaut feeding ducks on a sunny afternoon, reflection from the water.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "An astronaut is on the edge of a pond, feeding ducks on a sunny afternoon. The astronaut is wearing a full space suit, complete with a helmet, which contrasts amusingly with the Earth's bright and serene natural setting. They are reaching out with a gloved hand to sprinkle seeds onto the surface of the water. The ducks eagerly gather around, causing gentle ripples on the pond. The sunlight shines brilliantly, creating shimmering reflections on the water and highlighting the astronaut's suit and the vibrant ducks. The camera pans slowly from the side, capturing the charming and surreal interaction." + }, + { + "prompt_en": "Two pandas discussing an academic paper.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "Two pandas are seated side by side at a small wooden table covered with papers. One panda, with a gentle expression, gestures towards a sheet full of graphs and text, while the other panda leans in closer, appearing curious and engaged. The pandas are surrounded by a cozy interior with bookshelves filled with books in the background. The warm lighting creates a scholarly yet intimate atmosphere as they seemingly 'discuss' the contents. The camera slowly pans from left to right, capturing the pandas' expressive faces and the details of the academic papers." + }, + { + "prompt_en": "Sunset time lapse at the beach with moving clouds and colors in the sky.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The camera captures a time-lapse of a breathtaking sunset at the beach. As the sun descends towards the horizon, the sky transforms into a vibrant palette of orange, pink, and purple hues. Clouds drift across the sky, creating a dynamic and ever-changing pattern against the colorful backdrop. The reflection of the dazzling colors dances on the surface of the ocean, enhancing the magical atmosphere. The camera remains steady, focusing on the horizon where the sun dips below, leaving a tranquil and mesmerizing scene."
+ }, + { + "prompt_en": "A fat rabbit wearing a purple robe walking through a fantasy landscape.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A plump rabbit wearing a luxurious purple robe saunters through an enchanting fantasy landscape. The rabbit's robe, adorned with intricate golden patterns, sways gently with each step. The landscape is dotted with vibrant, oversized mushrooms and luminescent flowers that cast a magical glow. As the rabbit walks towards the camera, a gentle breeze causes the robe to flutter slightly. The camera smoothly follows the rabbit, capturing the whimsical scenery and enhancing the wonder of the journey." + }, + { + "prompt_en": "A koala bear playing piano in the forest.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A whimsical scene unfolds in the forest as a koala bear sits in front of an upright piano. The koala, with its fluffy ears and soft grey fur, uses its paws to press down on the keys, creating a light, playful melody. Surrounding the koala are tall trees and lush greenery, dappled with sunlight filtering through the leaves, casting a harmonious and magical atmosphere. The camera gently circles around the koala, capturing the playful interaction between the animal and the instrument, with occasional close-ups of the koala's expressive face and moving paws." + }, + { + "prompt_en": "An astronaut flying in space.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In the vast expanse of space, an astronaut is effortlessly floating, surrounded by the stars and the distant glow of a nearby planet. The astronaut is clad in a sleek, white space suit that reflects the faint light from the planet, creating a striking contrast against the dark void of space. Gently moving through the zero-gravity environment, the astronaut's visor gleams as he gazes at the universe around him. The camera smoothly orbits around the astronaut, capturing the serene and awe-inspiring beauty of this cosmic journey." + }, + { + "prompt_en": "Fireworks.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The night sky is illuminated with a vibrant display of fireworks, bursting into a multitude of colors and patterns. Brilliant reds, greens, blues, and golds light up the darkness in intricate designs. The camera captures the ascent of the fireworks as they shoot upward, then elegantly pans out to reveal the dazzling explosions that spread across the sky. The sparkling trails slowly fade, leaving a momentary silence before the next set of fireworks bursts into life. The atmosphere is lively and celebratory, bringing energy and excitement to the scene." + }, + { + "prompt_en": "An animated painting of fluffy white clouds moving in sky.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "An animated painting captures fluffy white clouds peacefully drifting across a tranquil blue sky. The clouds are soft and voluminous, with gentle shadows adding depth to their forms. As they move slowly from left to right, the subtle changes in light create a harmonious and serene atmosphere. The artwork has an ethereal quality, with each cloud gracefully changing shape as it floats by. The camera pans smoothly from left to right, following the clouds' journey across the sky." 
+ }, + { + "prompt_en": "Flying through fantasy landscapes.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The camera soars through a series of vibrant fantasy landscapes, each more enchanting than the last. Starting in a lush, green forest where trees tower majestically, the leaves shimmering with an ethereal glow under soft sunlight. The camera smoothly transitions to a sweeping view of towering mountains capped with glistening snow, the peaks bathed in golden light. Next, it glides through a mystical valley filled with colorful flowers that sway gently as if in a dance choreographed by the wind. Finally, the camera ascends towards a sky adorned with floating islands, each carrying lush vegetation and sparkling waterfalls cascading into the clouds, creating a magical and otherworldly atmosphere." + }, + { + "prompt_en": "A bigfoot walking in the snowstorm.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A large, mysterious silhouette, purported to be Bigfoot, trudges through a fierce snowstorm. The creature is covered in dark, shaggy fur that is partially obscured by the swirling snow. The thick snowflakes are whipped about by the wind, creating an intense atmosphere. Bigfoot moves steadily through the deep snow, heading towards the camera, leaving behind a trail of large footprints. The blizzard conditions lend an eerie, otherworldly quality to the snowy landscape. The camera follows Bigfoot's movements, capturing the rugged determination in its stride." + }, + { + "prompt_en": "A squirrel eating a burger.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A lively squirrel is perched on a patch of green grass, holding a small burger with its tiny paws. The squirrel's bushy tail is curled up behind it, and its bright eyes are focused intently on the burger. It eagerly nibbles at the burger, taking small bites with its sharp teeth. Warm sunlight filters through the leaves above, casting dappled shadows on the ground, creating a peaceful and natural atmosphere. The camera slowly zooms in to capture the adorable scene in greater detail." + }, + { + "prompt_en": "A cat wearing sunglasses and working as a lifeguard at a pool.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A quirky scene features a cat donning a pair of oversized sunglasses while perched confidently on a poolside lifeguard chair. The cat, with a luxurious coat and a relaxed demeanor, looks out over the calm pool waters. Around its neck is a small, red lifeguard whistle hanging from a string. Behind the cat, the pool glistens under a bright sunny sky, creating a playful and whimsical atmosphere. The camera slowly zooms in, capturing the cat's laid-back expression and cool sunglasses." + }, + { + "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The scene depicts snow-covered rocky mountain peaks towering over deep canyons that wind through the landscape. The snow creates a pristine white blanket over the rugged terrain, with the peaks casting long shadows across the canyon depths. 
The canyons twist and bend, weaving through the high-elevated mountain range, creating a dramatic and awe-inspiring vista. The camera provides a sweeping overview of this majestic setting, capturing the interplay of light and shadow as it glides slowly over the landscape, highlighting the grandeur and natural beauty of the snow-covered peaks and canyons." + }, + { + "prompt_en": "Splash of turquoise water in extreme slow motion, alpha channel included.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In extreme slow motion, a splash of vibrant turquoise water is captured, every droplet shimmering against a plain backdrop. The water arcs gracefully, creating intricate patterns as it ascends and descends. The motion is smooth and fluid, emphasizing the elegance of the splash. The droplets separate and glisten, catching the light in captivating ways. An alpha channel is included, isolating the splash and providing crisp edges against any background. The camera remains steady, focusing exclusively on the mesmerizing movement of the water." + }, + { + "prompt_en": "an ice cream is melting on the table.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A single scoop of ice cream, a rich chocolate color, is placed on a wooden table. The ice cream is partially melted, forming a small puddle around the base of the scoop. Soft lighting lends a warm glow to the scene, highlighting the creamy texture. The camera gently circles around the ice cream, focusing on the smooth melting patterns and the wooden texture of the table surface. A hint of sunlight glints off the melting edges, accentuating the dessert's inviting appearance." + }, + { + "prompt_en": "a drone flying over a snowy forest.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A drone is gracefully flying over a vast, snowy forest, capturing the pristine beauty of the landscape below. The snow blankets the tops of tall evergreen trees, and the scene is illuminated by a soft, diffused light creating a serene and tranquil atmosphere. As the drone moves forward, the camera pans slightly to reveal more of the undulating terrain, with gentle hills and valleys. The sparkling snow contrasts beautifully with the deep green of the evergreens, and the overall view is expansive and breathtaking. The drone's smooth and steady movement enhances the peaceful ambiance of the wintery scene." + }, + { + "prompt_en": "a shark is swimming in the ocean.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A sleek and powerful shark is swimming gracefully through the clear blue ocean water. The sunlight filters from above, casting patterns on its streamlined body as it moves steadily forward. The shark's movements are smooth and purposeful, its dorsal fin slicing through the water's surface. Bubbles trail subtly behind as the shark adjusts its course slightly. The camera slowly follows the shark from behind and slightly below, offering a view of its impressive form against the vast ocean backdrop." + }, + { + "prompt_en": "Aerial panoramic video from a drone of a fantasy land.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The aerial video starts with a sweeping panoramic view of a fantasy land, captured from a drone high above. 
The landscape includes lush, rolling hills with vibrant green grasses, dotted with fantastical, towering trees with golden leaves. In the distance, a magnificent castle made of gleaming white stone rises majestically, its towers reaching towards the sky. The camera glides smoothly across the scene, revealing sparkling streams meandering through the terrain and small, charming villages nestled among the hills. The lighting is ethereal, casting a magical glow across the entire fantasy realm, creating an enchanting and otherworldly atmosphere." + }, + { + "prompt_en": "a teddy bear is swimming in the ocean.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A charming teddy bear floats in the calm ocean, gently bobbing with the rhythm of the waves. The plush teddy is light brown with a soft, cuddly appearance, its arms spread wide as it drifts slightly up and down with the water. Sunlight shimmers on the ocean's surface, creating a serene and playful atmosphere. The camera captures a close-up of the teddy bear from above, focusing on its joyful expression and subtle movements in the soothing water." + }, + { + "prompt_en": "time lapse of sunrise on mars.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The video captures a time-lapse of a sunrise on Mars. The scene begins with a dark Martian landscape, characterized by rugged terrain and dusty, red-hued rocks. As the sun begins to rise over the horizon, it emits a soft, bluish glow that gradually intensifies, illuminating the landscape in a surreal light. The Martian sky transitions from a deep indigo to lighter shades of blue and orange, reflecting off the barren ground. The camera remains still, allowing the viewer to absorb the changing atmosphere and the dramatic shift in lighting over the alien world." + }, + { + "prompt_en": "golden fish swimming in the ocean.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A golden fish gracefully swims in the ocean's blue depths, its scales shimmering in the light filtering down from above. The fish's delicate fins undulate as it glides forward, creating gentle ripples in the water. Small bubbles trail behind as it navigates through the calm, clear waters. The serene ambiance is emphasized by the gentle sway of submerged aquatic plants in the background. The camera smoothly follows the fish, capturing its elegant movements and the tranquil underwater environment." + }, + { + "prompt_en": "An artist brush painting on a canvas close up.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A close-up view captures an artist's hand skillfully guiding a brush on a canvas. The brush, finely tipped, is delicately applying vibrant strokes of blue and green paint. The canvas texture is visible, with the fresh paint glistening slightly in the light. The hand appears steady and focused, with subtle movements as the artist carefully adds details. The lighting is soft, highlighting the colors and emphasizing the artist's careful technique. The camera remains tightly focused on the brush and canvas, creating an intimate and dynamic view of the painting process." 
+ }, + { + "prompt_en": "A drone view of celebration with Christmas tree and fireworks, starry sky - background.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The video opens with a sweeping drone view capturing a festive celebration from above. In the center, a towering, beautifully decorated Christmas tree sparkles with twinkling lights and colorful ornaments. Surrounding the tree are people joyfully gathered, some dancing and others gazing upwards. As the camera slowly moves closer, vibrant fireworks burst into the sky, illuminating the night with brilliant colors. The backdrop reveals a starry sky, adding a magical feel to the scene. Soft ambient lighting from the celebration below enhances the festive atmosphere." + }, + { + "prompt_en": "happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A cheerful dog sits facing the camera in a studio setting, wearing an adorable yellow turtleneck that accentuates its joyful expression. The dog's eyes are bright and lively, and its mouth is open in what looks like a happy smile, creating an endearing portrait. The dark background contrasts beautifully with the bright color of the turtleneck, highlighting the dog's fur and features. The camera keeps a steady focus on the dog's face, capturing the warmth and personality in its expression." + }, + { + "prompt_en": "Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a studio setting with a minimalist, white background, a group of intricately crafted origami dancers made from white paper is performing a modern dance. The delicate folds of the paper create elegant and fluid forms, suggesting dynamic movement as the dancers strike poses in mid-motion. The lighting is soft, accentuating the shadows and textures of the origami figures. The camera pans slowly, capturing the poetic grace of the paper dancers in a gentle, rhythmic motion." + }, + { + "prompt_en": "Campfire at night in a snowy forest with starry sky in the background.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a snowy forest under a starry night sky, a warm campfire crackles and glows brightly amidst the surrounding snow-covered trees. The orange flames flicker and illuminate the immediate area, casting dancing shadows on the pristine white snow. The stars twinkle above, adding to the serene and magical atmosphere. The scene is viewed with a gentle camera pan, capturing the vast expanse of the starry sky above and the cozy warmth of the campfire in the foreground." + }, + { + "prompt_en": "a fantasy landscape", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The scene opens with a sweeping view of a fantasy landscape, bathed in the soft, ethereal glow of a twilight sky. Tall, luminous trees with shimmering foliage create a magical atmosphere, their branches stretching out gracefully. In the background, towering, fantastical mountains rise, topped with glowing snowcaps that reflect the sky's radiant hues. A serene, sparkling river meanders through the landscape, reflecting the colors of the sky. 
The camera gently pans across the scene, capturing the otherworldly beauty and tranquility of this enchanted realm." + }, + { + "prompt_en": "A 3D model of a 1800s victorian house.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A 3D model showcases a grand 1800s Victorian house. The house features intricate architectural details, with ornate gabled roofs and elaborate wooden trims. The facade is adorned with bay windows, giving depth and character to the structure. The front porch is supported by decorative columns and surrounded by a wrought-iron railing. The model captures the essence of the era, with its high-pitched roofs and intricately designed exterior, all set in a soft, ambient lighting that highlights its historical charm. The camera smoothly zooms in, slowly revealing the exquisite details of the house's design." + }, + { + "prompt_en": "this is how I do makeup in the morning.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A charming and beautiful woman in her mid-20s is seated at a well-lit vanity mirror, surrounded by various makeup products. She gracefully applies foundation with a brush, focusing on achieving an even and flawless complexion. Her long, shiny hair is tucked behind her ears to keep it out of her face. The camera captures her reflection in the mirror, providing a close-up view of her careful technique. The soft, ambient lighting creates a warm and inviting atmosphere as she continues her morning makeup routine." + }, + { + "prompt_en": "A raccoon that looks like a turtle, digital art.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In this digital art piece, a raccoon with features resembling a turtle is depicted in a whimsical and imaginative style. The raccoon has a turtle-like shell on its back, adorned with intricate patterns and shaded in green and brown hues. Its fur maintains the classic raccoon colors, with grey and black stripes on its face and bushy tail. The setting is a lush forest, with vibrant foliage creating a vivid backdrop. The lighting is soft and highlights the raccoon's unique features, accentuating the artistic blend of raccoon and turtle traits. The camera view is slightly angled from the side to reveal both the raccoon's face and the shell, showcasing the creative melding of these two creatures." + }, + { + "prompt_en": "Robot dancing in Times Square.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A humanoid robot is dancing energetically in the iconic Times Square, surrounded by the vibrant lights of towering digital billboards. The robotic figure, with a sleek metallic body and glowing blue accents, performs smooth and rhythmic dance moves, capturing the attention of bystanders. The scene is lively, with bustling crowds and bright neon colors reflecting off the robot's shiny surface. The camera circles around the robot, emphasizing its dynamic moves against the backdrop of the bustling urban environment." + }, + { + "prompt_en": "Busy freeway at night.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A busy freeway at night is seen from a high vantage point, showcasing a stream of vehicle headlights and taillights. 
The bright, white headlights of cars coming towards the camera contrast with the red taillights of vehicles moving away. The scene is illuminated by streetlights, casting a faint golden glow over the freeway. The overall atmosphere is bustling and dynamic, with cars weaving slightly as they maintain their paths in the various lanes. The camera smoothly pans along the freeway, capturing the continuous flow of traffic." + }, + { + "prompt_en": "Balloon full of water exploding in extreme slow motion.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In this dramatic scene, a water-filled balloon is suspended in the air, ready to explode in extreme slow motion. The balloon's smooth and glossy surface is fully stretched, capturing the light and casting a subtle reflection. When the balloon bursts, water erupts in a mesmerizing display, creating a spherical formation that radiates outward. Droplets and streams of water are clearly visible, floating in the air before gracefully falling. The background is softly blurred to keep the focus on the intricate patterns of the exploding water, while the lighting highlights the transparency and shimmer of the droplets. The camera remains steady, providing a clear and captivating view of the entire explosion." + }, + { + "prompt_en": "An astronaut is riding a horse in the space in a photorealistic style.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a surreal and photorealistic scene set against the vast expanse of space, an astronaut in a pristine white spacesuit is riding a sleek, majestic horse. The spacesuit features a reflective visor and intricate details, and the horse's mane flows gracefully as it trots forward. Stars twinkle in the background, and a distant planet adds depth to the cosmic setting. The scene is beautifully illuminated, with faint light highlighting the astronaut and horse, creating a mesmerizing and otherworldly atmosphere. The camera captures this unique spectacle at a steady angle, showcasing the elegant and unexpected pairing against the infinite universe." + }, + { + "prompt_en": "Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a macro slow-motion shot, roasted coffee beans gently tumble into an empty ceramic bowl. The camera focuses closely on the rich, textured surface of the dark brown beans, capturing their glossy sheen and small surface imperfections. As the beans cascade downwards, they create a soft, cascading sound, with each bean clinking against the bowl's surface in a rhythmic manner. The lighting is warm, emphasizing the natural deep and rich colors of the beans, and creating a cozy, aromatic atmosphere." + }, + { + "prompt_en": "Sewing machine, old sewing machine working.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "An antique sewing machine is in operation, with its intricate metalwork and polished wooden base gleaming under soft, warm lighting. The needle swiftly pierces through a piece of fabric, creating a rhythmic movement as the machine's wheel turns smoothly. The camera pans closer to capture the precise motion of the needle and the fabric feeding through the guiding mechanism. 
The machine's vintage charm is highlighted by its ornate design, adding a touch of nostalgia to the scene." + }, + { + "prompt_en": "Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A vibrant cloud of colorful ink slowly disperses in clear water, creating a mesmerizing abstract display. The ink drops swirl and intertwine, forming intricate patterns that resemble a dreamlike cloudscape. The colors are rich and varied, including deep blues, vivid reds, and bright yellows, blending harmoniously as they expand and spread. The water captures the motion beautifully, with the swirling ink creating soft, flowing shapes. The scene is elegantly lit, highlighting the delicate details and the fluidity of the ink's movement. The camera gently pans around the scene to showcase the dynamic and graceful evolution of the ink cloud." + }, + { + "prompt_en": "Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A few large purple plums are rotating slowly on a white turntable, perfectly isolated against a white background. As they turn, water droplets begin to form on their smooth, vibrant skin, adding a fresh and juicy appearance. The camera captures a close-up, macro view, highlighting the rich color and texture of the plums, which glisten under bright lighting. As the turntable continues to rotate, the plums reveal their roundness and the play of light on their surfaces. The camera maintains a steady focus on the plums, enhancing their allure and freshness." + }, + { + "prompt_en": "Vampire makeup face of beautiful girl, red contact lenses.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A strikingly beautiful girl is seen with dramatic vampire makeup. Her skin is pale, creating a stark contrast with her deep red contact lenses that give her an intense and captivating gaze. Her lips are painted a bold, deep crimson, enhancing her alluring vampire look. Her hair is sleek and dark, framing her face perfectly. The lighting is dim and moody, highlighting her features and adding an air of mystery to her appearance. The camera slowly pans closer to her face, emphasizing her enchanting and supernatural look." + }, + { + "prompt_en": "Ashtray full of butts on table, smoke flowing on black background, close-up", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A close-up view captures an ashtray overflowing with cigarette butts, resting on a dark table. Wisps of smoke gracefully rise and swirl against a solid black background, creating a sense of motion and atmosphere. The ashtray is detailed with a metallic sheen, reflecting light from the smoldering embers of the cigarette remnants. The scene is illuminated with soft, ambient lighting, highlighting the textures of the ashtray and the swirling smoke. The camera remains steady, focusing on the interplay between the smoke and the surrounding darkness." 
+ }, + { + "prompt_en": "Pacific coast, carmel by the sea ocean and waves.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The scene captures the picturesque Pacific coast at Carmel-by-the-Sea, where the ocean waves are rhythmically crashing against the rocky shoreline. The camera slowly pans from left to right, revealing the diverse textures of the coastline, with rugged rocks and patches of sandy beach. The sunlight glistens on the surface of the water, creating shimmering reflections that dance with the movement of the waves. In the distance, the horizon meets the expansive, deep blue ocean, offering a serene and tranquil atmosphere." + }, + { + "prompt_en": "A teddy bear is playing drum kit in NYC Times Square.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In the bustling and brightly lit Times Square of NYC, a charming teddy bear is sitting at a drum kit. The teddy bear has soft, fluffy brown fur and wears a small, stylish hat tilted slightly to the side. It holds drumsticks in its small paws, rhythmically tapping on the drum set, which is scaled to its size. The bright LED billboards and lively crowd create a vibrant and energetic atmosphere, amplifying the teddy bear's playful performance. The camera slightly pans around, capturing the bear's enthusiastic drumming against the backdrop of iconic Times Square lights." + }, + { + "prompt_en": "A corgi is playing drum kit.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A charming corgi is enthusiastically playing a small drum kit. The corgi stands on its hind legs, using its front paws to tap the drums and cymbals energetically. The drum kit consists of a bass drum, snare drum, and a couple of cymbals, all proportionate to the corgi's size. The corgi's fluffy fur and lively ears make for an adorable sight as it joyfully plays the kit. The scene is bright and cheerful, with studio lighting highlighting the corgi's playful performance. The camera captures the scene with a steady shot that focuses on the corgi's animated drumming." + }, + { + "prompt_en": "An Iron man is playing the electronic guitar, high electronic guitar.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "An Iron Man statue is depicted with a realistic metallic sheen, standing tall and posed in an action-oriented position, holding an electric guitar. The guitar boasts a sleek design and vibrant colors, with its body angled upward as if Iron Man is mid-performance, strumming intensely. The setting is energetic, with bright stage lights casting vivid reflections off the metallic surfaces, creating a dynamic and futuristic atmosphere. The camera captures Iron Man from a low angle, emphasizing the statue's imposing presence and lively performance." + }, + { + "prompt_en": "A raccoon is playing the electronic guitar.", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A raccoon is seated upright, strumming an electronic guitar skillfully. The raccoon has fluffy gray fur and distinctive black rings around its eyes, adding to its endearing appearance. Its small paws move adeptly over the guitar strings, creating a sense of lively rhythm. The guitar is bright red with a glossy finish, standing out against the raccoon's fur. 
The background features a softly lit stage with colorful spotlights casting a dynamic ambiance, enhancing the playful and musical atmosphere of the scene. The camera angle is centered on the raccoon, capturing its animated expression as it enjoys playing the guitar." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A vibrant, impressionistic scene depicts a small boat sailing leisurely along the Seine River. The water is depicted with swirling blues and greens, capturing the fluidity and movement of the river. The iconic Eiffel Tower stands tall in the background, painted in bold, expressive strokes. The sky is depicted with Van Gogh’s signature style—whirling clouds and dynamic brushwork in shades of yellow and orange suggest a serene, dreamy atmosphere. The camera smoothly pans across the scene, enhancing the painting’s lively character and bringing the viewer closer to the boat as it glides gently through the water." + }, + { + "prompt_en": "A corgi's head depicted as an explosion of a nebula", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A vibrant and imaginative scene shows a corgi's head transformed into an explosion resembling a nebula. The nebula features swirling, colorful gases and cosmic dust, with brilliant hues of blue, purple, and pink radiating outward. The corgi's facial features are subtly integrated into the celestial pattern, with its charming eyes and distinctive ears barely visible within the swirling nebula. The impression is both dreamlike and surreal, with stars scattered throughout, adding a mystical and otherworldly atmosphere to the scene. The camera slowly pans around this cosmic image, enhancing the sense of wonder and vastness." + }, + { + "prompt_en": "A fantasy landscape", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A breathtaking fantasy landscape unfolds, showcasing a series of towering, otherworldly mountains with jagged peaks reaching into the sky. The foreground features a shimmering lake with crystal-clear water reflecting the colors of the sky, which is a mix of vibrant purples and deep blues, speckled with twinkling stars. In the distance, a pair of glowing orbs hover in the air, casting a mystical light across the landscape. The scene is bathed in a magical, ethereal glow, creating an enchanting and surreal atmosphere as the camera slowly pans across the landscape, giving viewers a full view of this mesmerizing world." + }, + { + "prompt_en": "A future where humans have achieved teleportation technology", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a sleek, ultra-modern room bathed in soft blue ambient lighting, a teleportation booth stands at the center, exuding a futuristic aura. The booth is a cylindrical structure made of transparent panels with glowing circuits tracing intricate patterns across its surface. Inside, a middle-aged man with distinguished features and wearing a streamlined, futuristic outfit prepares for his teleportation. His expression is calm and composed as he interacts with a digital panel beside the booth. 
As the camera circles around, the man takes a deep breath, and the booth emits a gentle hum, indicating the initiation of the teleportation process. The atmosphere is serene yet filled with anticipation." + }, + { + "prompt_en": "A jellyfish floating through the ocean, with bioluminescent tentacles", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A stunning jellyfish floats gracefully through the deep ocean, its bioluminescent tentacles trailing behind and casting an ethereal glow in the dark water. The jellyfish's translucent, bell-shaped body pulses gently as it propels itself forward, creating a mesmerizing scene. The tentacles emit a soft, mesmerizing light, providing a serene and magical atmosphere in the underwater environment. The camera smoothly follows the jellyfish, capturing its fluid movements and the interplay of light and shadow dancing in the ocean depths." + }, + { + "prompt_en": "A Mars rover moving on Mars", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A Mars rover is seen traversing the rugged, reddish terrain of the Martian surface. The rover's wheels carefully navigate the rocky ground, creating small trails of dust behind as it moves forward. The machine is equipped with several instruments on its body, including a camera mounted on a mast that slowly pans to survey the surrounding landscape. Overhead, the sky has a pale, dusty hue, typical of Mars' thin atmosphere, adding to the vast, alien ambiance of the scene as the rover explores this distant world." + }, + { + "prompt_en": "A panda drinking coffee in a cafe in Paris", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a charming Parisian cafe, a panda is sitting at a small round table. The panda delicately holds a coffee cup with both paws, taking a sip of the steaming beverage. The cafe's ambiance is cozy and inviting, with soft lighting creating a warm atmosphere. The table is adorned with a small vase containing a single red rose, adding a touch of elegance. In the background, there are large windows revealing a view of a bustling Paris street, with people walking by and the iconic Eiffel Tower faintly visible in the distance." + }, + { + "prompt_en": "A space shuttle launching into orbit, with flames and smoke billowing out from the engines", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The scene captures a space shuttle in the early stages of launching into orbit. The shuttle's engines roar to life, producing intense flames and billowing clouds of smoke. The flames glow a vivid orange, contrasting against the crisp blue sky. The camera starts with a wide-angle view, then smoothly zooms in slightly to focus on the intense action at the base of the shuttle. The powerful force generated by the engines causes the surrounding smoke to swirl and dissipate rapidly as the shuttle begins its ascent. The atmosphere is filled with tension and excitement, highlighting the raw power of the launch." + }, + { + "prompt_en": "A steam train moving on a mountainside", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A steam train is chugging along a winding track carved into the mountainside. Thick plumes of white steam rise from its chimney, catching the sunlight as they drift upward. 
The train's shiny black engine, adorned with brass details, leads a series of vintage carriages painted in deep red. As the camera follows the train's steady progress, the dramatic landscape of the steep mountain cliffs and dense green forests unfolds around it. The camera angle shifts to emphasize the train's journey against this majestic backdrop, enhancing the sense of movement and grandeur." + }, + { + "prompt_en": "A super cool giant robot in Cyberpunk Beijing", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A super cool giant robot stands towering over the vibrant, neon-lit skyline of Cyberpunk Beijing. The robot's metallic body gleams with intricate circuits and panels, pulsing with bright LED lights of blue and purple. As the robot turns its head slowly, its piercing eyes glow with an intense white light, adding to its imposing presence. The cityscape in the background is awash with multicolored neon signs and towering skyscrapers, creating a futuristic and bustling atmosphere. The camera tilts upwards to capture the full height of the robot against the night sky, emphasizing its massive scale and advanced design." + }, + { + "prompt_en": "A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The camera captures the breathtaking view of a tropical beach at sunrise. Tall, gently swaying palm trees gracefully frame the scene, their silhouettes accentuated by the warm, golden light of the rising sun. The crystal-clear water glistens in the foreground, reflecting the vibrant hues of orange and pink painted across the sky. The tranquil atmosphere is enhanced by the gentle lapping of small waves against the shore. The camera slowly pans from left to right, capturing the serene beauty of this idyllic setting." + }, + { + "prompt_en": "Cinematic shot of Van Gogh's selfie, Van Gogh style", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The video opens with a cinematic shot of a painting styled in Van Gogh's signature technique, featuring bold brushstrokes and vibrant colors. The painting depicts Van Gogh himself, with tousled auburn hair and an intense gaze. He is dressed in a blue coat, which contrasts with the swirling background of deep blues and yellows. The camera slowly zooms in on his expressive face, highlighting the texture and depth of the brushwork. The lighting emphasizes the rich colors and intricate details of the artwork, creating a captivating and artistic atmosphere." + }, + { + "prompt_en": "Gwen Stacy reading a book", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "Gwen Stacy is seated comfortably in a stylish, plush armchair, deeply engrossed in a book. She is a beautiful young woman with striking features and long, blonde hair cascading over her shoulders. Gwen is wearing a chic blouse and a pair of jeans. Her expression is one of focused concentration, with her eyes attentively following the lines on the page. The room is softly lit, creating a warm and inviting atmosphere, while the camera gently zooms in to capture the serenity of the moment." 
+ }, + { + "prompt_en": "Iron Man flying in the sky", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "Iron Man is soaring through a clear blue sky, with the sun reflecting off his shiny red and gold armor. As he flies, his repulsors create a trail of light behind him, highlighting his sleek and aerodynamic form. The camera follows him from behind, capturing the dynamic motion of his flight as he maneuvers gracefully through the air. Occasionally, Iron Man performs agile spins and loops, demonstrating his advanced suit capabilities. The scene is vibrant and energetic, conveying the thrill of high-speed flight." + }, + { + "prompt_en": "The bund Shanghai, oil painting", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "An oil painting capturing the scenic beauty of The Bund in Shanghai. The painting portrays the sweeping skyline with its mix of historic and modern architecture along the waterfront. The brushwork gives the buildings an elegant and detailed texture, with splashes of color reflecting the early evening light. The Huangpu River is depicted in flowing strokes, adding movement and life to the scene. The sky above is adorned with warm hues of sunset, gradually transitioning to deeper colors, setting a serene and atmospheric backdrop. The perspective leans slightly upward, offering an expansive view of this iconic area." + }, + { + "prompt_en": "Yoda playing guitar on the stage", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "Yoda, the legendary Jedi Master, is standing on a stage, expertly strumming an electric guitar. The stage is illuminated with colorful lights, casting vibrant hues across the scene. Yoda, with his distinctive green skin and large, expressive eyes, is wearing his classic brown robe. Despite his petite stature, he holds the guitar confidently, fingers skillfully moving over the strings. His expression exudes both concentration and enjoyment. The camera angle captures him from the front, with the stage lights creating an energetic ambiance." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The scene captures a beautiful coastal beach in spring, depicted in the iconic Ukiyo-e style reminiscent of Hokusai. The waves gently lap onto the shore, their rhythmic movement creating a serene and tranquil atmosphere. The delicate interplay of blue and white in the waves reflects the crafted brushwork characteristic of the Ukiyo-e style. The sandy beach is dotted with small blossoms, adding a touch of spring's vibrant colors to the scene. In the distance, rolling hills and a clear sky enhance the composition, evoking a sense of calm and harmony. The camera pans slowly from left to right, showcasing the entire picturesque landscape." + }, + { + "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "Imagine a painting inspired by Vincent van Gogh, depicting a beautiful coastal beach in spring. The scene is vibrant and full of life, with rhythmic waves gently lapping onto the golden sand. 
The waves are painted with Van Gogh's signature swirling brushstrokes, capturing the dynamic energy of the ocean. The sky above is a clear blue, and the sunlight casts a warm, golden hue across the beach. Delicate flowers in colorful blooms can be seen dotting the edges of the sandy landscape, adding to the spring atmosphere. The painting encapsulates the tranquil yet lively essence of the beach, using vivid colors and expressive strokes." + }, + { + "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A small boat is seen sailing leisurely along the Seine River, moving from left to right in the frame with the iconic Eiffel Tower standing tall in the background. The setting sun bathes the scene in a golden hue, adding a romantic atmosphere. The boat gently glides through the calm waters, creating soft ripples that spread outwards. The Eiffel Tower, partially obscured by a few nearby trees, is silhouetted against the vibrant sky. The camera slowly pans right, enhancing the serene movement of the boat and the breathtaking view." + }, + { + "prompt_en": "A car moving slowly on an empty street, rainy evening", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "On a rainy evening, a car is moving slowly along an empty street, with its headlights cutting through the misty environment. Raindrops glisten on the car's surface, illuminated by the soft glow of streetlights lining the road. The windshield wipers methodically sweep away the raindrops, revealing the reflections of the lights on the wet pavement. The camera gently pans to follow the car as it makes its way along the slick, deserted street, capturing the tranquil, almost cinematic atmosphere of the scene." + }, + { + "prompt_en": "A cat eating food out of a bowl", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A fluffy cat crouches near a small bowl placed on a kitchen floor, eating its meal. Its fur is predominantly white, with soft gray patches accentuating its fluffy tail and ears. The cat's bright green eyes glance around as it takes a bite from the bowl filled with cat food. The lighting is warm, casting gentle shadows across the kitchen tiles. The camera moves slightly, capturing different angles of the cat enjoying its meal." + }, + { + "prompt_en": "A cat wearing sunglasses at a pool", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A charming scene unfolds by the poolside with a stylish cat wearing oversized sunglasses, comfortably lounging on a vibrant striped towel. The cat's fur is sleek and well-groomed, adding to its cool persona. The sunglasses' reflective lenses catch the light, making the cat appear even more fashionable. Nearby, the bright surface of the pool glistens under the sun, creating a playful sparkle. The camera slowly pans from the shimmering pool water to focus on the cat, highlighting its relaxed demeanor in a sunny and cheerful atmosphere." + }, + { + "prompt_en": "A confused panda in calculus class", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a lighthearted classroom setting, a panda is seated at a desk with an open calculus textbook in front of it. 
The panda appears puzzled, its head slightly tilted and eyes wide as it looks at the complex equations on the page. The classroom has a few desks, a chalkboard filled with calculus equations in the background, and soft natural light coming through the windows, creating a warm and inviting atmosphere. The panda's adorable expression of confusion adds charm to the scene." + }, + { + "prompt_en": "A cute fluffy panda eating Chinese food in a restaurant", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a cozy, softly lit restaurant setting, a cute and fluffy panda is seated at a small, round table. The panda, with its distinctive black and white fur, is gently holding a pair of chopsticks in its paws. On the table in front of the panda is an array of colorful Chinese food dishes, including steaming dumplings and vibrant stir-fried vegetables. The panda leans forward slightly, its eyes wide with delight as it selects a dumpling with the chopsticks. The background is filled with warm wooden decor and traditional Chinese lanterns, enhancing the cozy and inviting atmosphere. The camera smoothly pans in to capture the panda's endearing expression of contentment." + }, + { + "prompt_en": "A cute happy Corgi playing in park, sunset", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "During sunset in a vibrant park setting, a cute and happy Corgi joyfully plays on a lush green lawn. The Corgi's fluffy coat glistens in the golden light, and its energetic tail wags enthusiastically as it runs around in circles. The dog's ears perk up as it pauses to sniff the air, capturing the playful and carefree essence of the moment. The camera smoothly follows the Corgi's playful movements, creating a warm and lively atmosphere with the sun low in the sky, casting long, soft shadows across the grass." + }, + { + "prompt_en": "A cute raccoon playing guitar in a boat on the ocean", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A cute raccoon is sitting upright in a small wooden boat, gently rocking on the ocean waves. The raccoon, with its fluffy striped tail and expressive eyes, is strumming a miniature guitar with its tiny paws. Sunlight sparkles on the surface of the water, creating a serene and pleasant atmosphere. The camera slowly circles around the boat, capturing the endearing scene from different angles as the rhythmic sounds of the ocean accompany the raccoon's playful guitar performance." + }, + { + "prompt_en": "A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a tranquil outdoor setting, a happy, fuzzy panda sits near a crackling campfire, playing an acoustic guitar. The panda, with its soft, fluffy fur and distinctive black and white markings, strums the guitar joyfully, generating a relaxed and cheerful ambiance. Behind the panda, a majestic snow-capped mountain towers against a clear blue sky, adding a breathtaking natural backdrop. The warm glow of the campfire flickers and dances across the panda's fur, contributing to a cozy and serene atmosphere. The camera captures the scene with a gentle, panning motion, highlighting the harmony between the panda, the fire, and the stunning landscape." 
+ }, + { + "prompt_en": "A lightning striking atop of eiffel tower, dark clouds in the sky", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a dramatic night scene, the Eiffel Tower stands prominently under a sky filled with dark, swirling clouds. Suddenly, a brilliant bolt of lightning pierces the sky, striking the tip of the iconic tower. The lightning illuminates the surrounding clouds, casting sharp, electrifying shadows over the Paris skyline. The tower itself glows momentarily in the intense, pulsating light of the strike, creating a thrilling and atmospheric visual. The camera captures the scene with a slight zooming movement to emphasize the power and scale of the lightning strike against the towering structure." + }, + { + "prompt_en": "A modern art museum, with colorful paintings", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "Inside a modern art museum, the camera pans across a room showcasing a diverse collection of colorful paintings. The walls are adorned with vibrant, abstract artworks featuring bold brushstrokes and a spectrum of colors. Each painting varies in size and style, adding to the eclectic ambiance of the space. Soft, overhead lighting casts a gentle illumination on the art, highlighting the textures and details in each piece. The camera smoothly moves along the room, providing a captivating view of the artistic display." + }, + { + "prompt_en": "A panda cooking in the kitchen", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a cozy kitchen setting, an adorable panda is standing by the stove, stirring a pot with a wooden spoon. The panda, with its distinctive black and white fur pattern, appears focused on the task with its large, expressive eyes. The kitchen has warm lighting, and there are various cooking utensils and ingredients neatly arranged on the counter. Steam gently rises from the pot, creating a comforting and homely atmosphere. The camera captures a side view of the panda, highlighting its charming features and the lively scene." + }, + { + "prompt_en": "A panda playing on a swing set", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a playful scene, a panda is seen enjoying itself on a swing set. The panda is swinging back and forth, gripping the chains with its front paws while sitting comfortably on the swing. Its dark, round eyes and big, fluffy ears shine in the sunlight. The swing moves gently, creating a lively atmosphere with the greenery of a bamboo-filled background. The panda looks joyous, occasionally swaying its legs as it swings towards the camera, then away, adding a charming sense of motion to the scene." + }, + { + "prompt_en": "A polar bear is playing guitar", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A polar bear sits comfortably upright in a snowy landscape, skillfully strumming a guitar with its paws. The bear's thick white fur contrasts against the vivid colors of the guitar, creating a whimsical scene. The sky above is a soft blue, and snowflakes gently fall around the bear, adding a touch of magic to the atmosphere. The camera slowly pans around the bear, capturing different angles of this playful moment." 
+ }, + { + "prompt_en": "A raccoon dressed in suit playing the trumpet, stage background", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A raccoon dressed in a sleek, well-tailored suit stands on a stage with a spotlight focused on him, casting a dramatic shadow. The raccoon's fur is neatly groomed, and the suit enhances his charming appearance. He holds a shiny brass trumpet to his lips, with his small paws deftly maneuvering the instrument. The stage background features rich, royal blue curtains that add an elegant touch to the scene. The raccoon's eyes are closed, immersed in the melody as he plays, evoking a lively and engaging atmosphere. The camera slowly pans from left to right, capturing the raccoon's performance and the ambient glow of the stage lighting." + }, + { + "prompt_en": "A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A robot DJ is stationed on a futuristic Tokyo rooftop, skillfully playing the turntable amidst a heavy rainstorm. The setting is drenched in a cyberpunk atmosphere, with neon lights from surrounding skyscrapers reflecting off the wet surfaces. The robot DJ has a sleek, metallic design, with illuminated circuits glowing in vibrant colors. Its mechanical arms expertly manipulate the turntable, blending sci-fi beats that merge with the rhythmic sound of rain. The camera circles around the robot, capturing the urban skyline as lightning occasionally illuminates the scene, enhancing the fantastical and high-tech ambiance." + }, + { + "prompt_en": "A shark swimming in clear Caribbean ocean", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A sleek and impressive shark glides effortlessly through the crystal-clear turquoise waters of the Caribbean Ocean. The sunlight penetrates the water, creating shimmering patterns on the sand below. The shark's powerful body and streamlined fins cut gracefully through the water, as it swims parallel to the ocean floor. The clarity of the water highlights the intricate details of the shark's skin and movement. The camera follows the shark from a slightly elevated angle, capturing its smooth and elegant trajectory through the serene seascape." + }, + { + "prompt_en": "A super robot protecting city", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A massive super robot stands tall amidst the cityscape, its metallic armor gleaming in the daylight. The robot's impressive form is equipped with intricate details, including reinforced joints, a glowing core in its chest, and bright blue energy lines running along its frame. The robot gazes over the city with an authoritative presence, its posture strong and vigilant. Behind the robot, the city skyline stretches out with skyscrapers and buildings. A gentle breeze sways in the environment, adding a dynamic feel to the scene. The camera slowly pans from the ground up, capturing the robot's entire stature against the backdrop of the city." + }, + { + "prompt_en": "A teddy bear washing the dishes", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a cozy kitchen setting, a charming teddy bear stands at a kitchen sink, washing dishes. 
The teddy bear has soft brown fur and a cute, friendly expression. It's wearing a small apron tied around its waist. The teddy bear holds a sponge in one of its paws, cleaning a plate under a gentle stream of water from the faucet. Bubbles and suds fill the sink, and a few clean dishes are neatly stacked to the side. The atmosphere is warm and inviting, with sunlight filtering in through a nearby window. The camera shows a gentle side-to-side movement to capture the teddy bear's adorable kitchen routine." + }, + { + "prompt_en": "An epic tornado attacking above a glowing city at night, the tornado is made of smoke", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "In a dramatic nighttime scene, a colossal tornado composed of swirling smoke moves ominously above a glowing cityscape. The city, illuminated by countless twinkling lights, contrasts starkly with the dark, tumultuous sky. The tornado's smoke spirals violently, rising from the lower part of the frame towards the heavens. As it advances, it casts eerie shadows over the buildings below, and flashes of lightning occasionally illuminate the smoke, adding an otherworldly atmosphere. The camera angle gives a slightly upward view, emphasizing the tornado's immense and destructive power." + }, + { + "prompt_en": "An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "An oil painting depicts a sophisticated couple dressed in elegant formal evening wear, caught in a heavy downpour on their way home. The man, handsome and poised, wears a classic black tuxedo with a crisp white shirt, holding a large black umbrella above them. The woman, charming and graceful, dons a stunning long evening gown that shimmers in the dim light of the streetlamps. Her hair is styled elegantly, and she clutches a small clutch purse. Rain cascades down in heavy sheets, creating a sense of urgency and movement in the scene. The street glistens with reflections of the couple and the surrounding lights, capturing the romantic and dramatic atmosphere of the rainy night. The brushstrokes convey the texture of the rain and the couple's clothing, adding depth and emotion to the painting." + }, + { + "prompt_en": "Clown fish swimming through the coral reef", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A vibrant clownfish gracefully swims amidst a lively coral reef, its orange and white stripes contrasting beautifully with the colorful corals surrounding it. The fish navigates through the delicate maze of corals with gentle, fluid movements, occasionally pausing to explore the intricate textures and vivid colors of the reef. The sun's rays pierce the water, creating a shimmering effect that enhances the lively atmosphere underwater. The camera gently follows the clownfish, capturing the serene and mesmerizing environment of the reef." + }, + { + "prompt_en": "Hyper-realistic spaceship landing on Mars", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A hyper-realistic spaceship descends towards the rusty, red surface of Mars. The vessel, sleek and metallic, features intricate detailing and panels reflecting the soft golden light of the Martian sun. 
As it approaches the ground, dust begins to swirl below, kicked up by the ship's thrusters. The camera follows the spaceship closely, capturing its gradual and steady descent until it touches down gently on the rocky Martian terrain. In the background, the expansive Martian landscape stretches out, with reddish-brown hills and a clear, stark sky, enhancing the dramatic and otherworldly atmosphere of the scene." + }, + { + "prompt_en": "The bund Shanghai, vibrant color", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "The video captures the iconic view of The Bund in Shanghai, showcasing the city's vibrant colors. The camera moves smoothly across the bustling waterfront promenade where the shimmering lights of decorative buildings reflect off the Huangpu River, creating a dynamic cityscape. In the foreground, a few people can be seen strolling along the walkway, silhouetted against the brightly illuminated skyline. The atmosphere is lively, with the evening lights casting a colorful glow over the historic and modern architecture that makes The Bund a fascinating visual experience." + }, + { + "prompt_en": "Vincent van Gogh is painting in the room", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "Vincent van Gogh, depicted as a handsome man with a distinct red beard and intense, thoughtful eyes, is standing in a rustic, well-lit room. He wears a blue smock with paint smudges and holds a palette filled with vibrant colors in one hand. In his other hand, he wields a paintbrush, poised to add details to a canvas on an easel before him. The room is filled with natural light streaming through a window, casting soft shadows and creating a warm, creative atmosphere. The camera gently pans around Vincent, capturing his focused expression and the brushstrokes on his canvas." + }, + { + "prompt_en": "Yellow flowers swing in the wind", + "dimension": [ + "overall_consistency", + "aesthetic_quality", + "imaging_quality" + ], + "refined_prompt": "A cluster of vibrant yellow flowers is gently swaying in the wind, set against a backdrop of lush green foliage. The petals of the flowers catch the sunlight, creating a warm and inviting atmosphere. A gentle breeze causes them to sway rhythmically, and their shadows dance lightly across the surrounding leaves. The camera captures a subtle tilt and pan movement, following the flow of the flowers as they move, enhancing the tranquil and serene mood of the scene." + }, + { + "prompt_en": "alley", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "alley" + } + } + }, + "refined_prompt": "A narrow alleyway is captured from a straight-on view, showcasing its charming urban character. The alley is lined with brick walls on both sides, adorned with random patches of graffiti, giving it an artistic flair. As the camera pans slowly down the alley, the dim lighting creates an atmospheric ambiance, with soft shadows cast by overhead lanterns. A gentle breeze causes a few scattered leaves to dance across the ground, enhancing the scene's urban tranquility." + }, + { + "prompt_en": "amusement park", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "amusement park" + } + } + }, + "refined_prompt": "An amusement park scene showcases a vibrant and bustling environment with various attractions. 
A towering Ferris wheel is the centerpiece, its colorful cabins slowly rotating against a clear blue sky. Nearby, a carousel with brightly painted horses spins gently, the music adding to the lively atmosphere. People of all ages stroll through the park, some holding balloons, others eating cotton candy. The camera captures the scene from a slightly elevated angle, panning slowly across the park to encapsulate the lively and joyful ambiance." + }, + { + "prompt_en": "aquarium", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "aquarium" + } + } + }, + "refined_prompt": "In an aquarium setting, the camera captures a mesmerizing aquatic scene. Brightly colored tropical fish of various shapes and sizes swim gracefully through clear blue waters. Vibrant coral formations and green underwater plants are arranged among rocks on the aquarium floor, creating a lively and natural environment. The camera takes a gentle, sweeping motion, capturing the harmony of the marine life, highlighting the elegant movement of the fish as they dart and glide through their peaceful habitat." + }, + { + "prompt_en": "arch", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "arch" + } + } + }, + "refined_prompt": "A striking stone arch stands in a serene natural setting, surrounded by lush greenery and illuminated by the warm glow of the afternoon sun. The arch is meticulously crafted, with its intricate patterns and textures highlighted by the sunlight. The camera gently moves around the structure, showcasing its graceful curves and the contrast between the rugged stone and the soft foliage. The atmosphere is tranquil and peaceful, with the arch serving as a captivating focal point in this picturesque environment." + }, + { + "prompt_en": "art gallery", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "art gallery" + } + } + }, + "refined_prompt": "An elegant art gallery interior is displayed with clean, white walls and soft, ambient lighting. In the foreground, a modern sculpture made of smooth, curved metal is positioned on a pedestal, catching light on its polished surface. To the side, a large abstract painting with bold, vibrant colors hangs on the wall. The wooden floor adds warmth to the otherwise minimalist setting. The camera slowly glides from left to right, highlighting the artwork and creating a tranquil and contemplative atmosphere." + }, + { + "prompt_en": "bathroom", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "bathroom" + } + } + }, + "refined_prompt": "The video opens with a view of a modern bathroom, featuring a sleek, white countertop with a stylish sink. Above the sink, a large, rectangular mirror is mounted on the wall, reflecting the soft, ambient light from an overhead fixture. To the right of the sink, a neatly folded stack of white towels is arranged, and a small, elegantly potted plant adds a touch of greenery to the setting. The camera gently pans over the bathroom, highlighting the polished chrome fixtures and the clean, minimalist design of the space." 
+ }, + { + "prompt_en": "bakery shop", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "bakery shop" + } + } + }, + "refined_prompt": "Inside a cozy bakery shop, the camera pans across a beautifully arranged display of freshly baked goods. The assortment includes golden-brown croissants, artisan loaves of bread, and colorful pastries. The warm lighting highlights the textures and colors of the baked items, creating an inviting atmosphere. Behind the counter, a cheerful female baker in her thirties, with a radiant smile, is seen arranging more treats. She is dressed in a neat white apron over a pastel-colored blouse. The camera slowly zooms in on her friendly face as she measures flour with precise movements." + }, + { + "prompt_en": "ballroom", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "ballroom" + } + } + }, + "refined_prompt": "In an elegant ballroom, the scene opens with a wide view of a sparkling chandelier hanging from the ceiling, casting a warm, ambient glow across the room. The camera gently pans down to reveal the polished wooden floor, where a couple performs a graceful waltz. The man, dressed in a sharp, black tuxedo, leads with confident steps, while the woman, adorned in a flowing, red gown, follows with effortless grace. Their movements are synchronized and fluid, creating a sense of charm and romance as they glide across the ballroom floor. The room is surrounded by ornate mirrors and adorned with lavish drapery, adding to the luxurious atmosphere. The camera slowly circles them, capturing the elegance of their dance from various angles." + }, + { + "prompt_en": "bar", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "bar" + } + } + }, + "refined_prompt": "A cozy bar is showcased with warm, ambient lighting highlighting the inviting atmosphere. The bar counter is made of polished wood, with several bar stools neatly aligned in front of it. Behind the counter, a selection of various liquor bottles is arranged on illuminated shelves, creating a colorful display. A bartender, a charming man in his early thirties with a well-groomed beard, wearing a crisp white shirt and a black vest, is seen expertly preparing a drink. He is focused and skillful, adding an air of professionalism to the scene. The camera slowly pans from the bartender to the row of stools, emphasizing the welcoming environment." + }, + { + "prompt_en": "barn", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "barn" + } + } + }, + "refined_prompt": "A rustic wooden barn stands amidst an open field, bathed in the warm glow of the late afternoon sun. The barn has deep red wooden siding with a slightly weathered appearance, and a classic pitched roof made of dark shingles. A white barn door, half-open, reveals a glimpse of hay bales inside. The surrounding landscape is dotted with patches of lush green grass. The camera slowly zooms in on the barn, capturing its charming and tranquil rural atmosphere." + }, + { + "prompt_en": "basement", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "basement" + } + } + }, + "refined_prompt": "In a dimly lit basement, scattered storage boxes and an old wooden shelf filled with various tools and items can be seen. 
The lighting creates a moody and slightly mysterious atmosphere, with shadows casting on the concrete walls and floor. Cobwebs are visible in the corners, indicating the basement's infrequent use. The camera gently pans across the area, revealing a dusty, forgotten charm. A single bare light bulb hangs overhead, swaying slightly and providing the main source of light." + }, + { + "prompt_en": "beach", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "beach" + } + } + }, + "refined_prompt": "A serene beach scene unfolds as gentle waves lap at the shore. The golden sand stretches out under the soft, warm glow of the setting sun, which casts long shadows and creates a peaceful ambiance. Seagulls can be seen flying overhead, occasionally dipping down to skim the water's surface. The horizon is a blend of vibrant oranges and pinks against the clear sky, enhancing the tranquil mood. The camera slowly pans along the shoreline, capturing the rhythmic motion of the waves and the expansive beauty of the beach." + }, + { + "prompt_en": "bedroom", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "bedroom" + } + } + }, + "refined_prompt": "The camera slowly pans across a cozy bedroom, illuminated by the gentle glow of a bedside lamp. A neatly made bed with soft, plush pillows and a textured blanket takes center stage in the room. Above the bed, there's a piece of artwork hanging on the wall, adding a touch of elegance to the space. A small wooden nightstand stands beside the bed, holding a lamp with a soft white shade and a few books neatly stacked. The overall ambiance is serene and inviting, with the warm lighting creating a peaceful atmosphere." + }, + { + "prompt_en": "bridge", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "bridge" + } + } + }, + "refined_prompt": "A long, elegant suspension bridge stretches across a wide river, its cables gracefully curving between two massive towers. The bridge is illuminated by streetlights, casting a soft, warm glow along its length. As the camera pans from left to right, the bridge's arches and architectural details become more apparent. In the background, the city skyline is visible under a clear night sky, adding an urban atmosphere to the scene. The river beneath reflects the lights, creating a shimmering effect on the water's surface." + }, + { + "prompt_en": "botanical garden", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "botanical garden" + } + } + }, + "refined_prompt": "In a botanical garden, the camera glides over a well-maintained pathway that meanders through a lush, vibrant landscape. The garden is filled with various exotic plants and colorful flowers, creating a serene and picturesque environment. Tall, green trees provide shade, while scattered benches invite relaxation. As the camera moves forward, it captures a fountain in the center, with water gently cascading down, adding a soothing element to the tranquil atmosphere. The sunlight filters through the leaves, creating dappled patterns on the ground." 
+ }, + { + "prompt_en": "cafeteria", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "cafeteria" + } + } + }, + "refined_prompt": "In a bustling cafeteria, tables with trays of food can be seen, surrounded by people chatting and enjoying their meals. The warm lighting creates a welcoming atmosphere, highlighting the variety of colorful dishes. In the foreground, a cheerful woman in her twenties is carrying a tray with a sandwich and a drink, while wearing a stylish blouse and jeans. Her hair is elegantly styled, and a bright smile graces her face as she looks towards the camera. Behind her, the cafeteria's customers engage in conversation, creating a lively and friendly environment. The camera pans slightly to follow the movement of the woman as she walks to a nearby table." + }, + { + "prompt_en": "campsite", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "campsite" + } + } + }, + "refined_prompt": "The scene depicts a cozy campsite set in a lush forest environment. A large, sturdy tent with a warm, earthy color is pitched on a grassy area, with a few small logs arranged around a fire pit nearby. A gentle, crackling campfire flickers, casting a warm orange glow that illuminates the surrounding area. In the background, tall trees with verdant leaves sway slightly in the breeze, adding to the tranquil atmosphere. The camera moves slowly, capturing the serene and inviting ambiance of this outdoor retreat." + }, + { + "prompt_en": "campus", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "campus" + } + } + }, + "refined_prompt": "A beautiful university campus unfolds with well-maintained lawns and elegant, historic buildings. The camera begins with a panoramic sweep, showcasing a brick pathway lined with tall trees and vibrant green grass. Students, dressed in casual attire, are walking along the path, some carrying backpacks and others chatting in small groups. The atmosphere is lively and inviting, as the afternoon sun bathes the scene in a warm, golden light. The camera gently zooms in on a group of students sitting on a bench, engrossed in conversation and enjoying the serene ambiance of the campus." + }, + { + "prompt_en": "carrousel", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "carrousel" + } + } + }, + "refined_prompt": "A colorful carousel spins gently in a lively amusement park setting. The camera captures a close-up view of the carousel, showcasing its intricate details. The horses are ornately decorated in bright colors, with gold accents gleaming under the festive string lights that illuminate the scene. As the carousel rotates, the vintage music from the ride can almost be imagined, enhancing the nostalgic ambiance. The camera slowly pans around the carousel, revealing delighted faces of children and adults enjoying the ride." + }, + { + "prompt_en": "castle", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "castle" + } + } + }, + "refined_prompt": "The video captures a grand medieval castle, situated atop a rocky hill, with large stone walls and towering turrets reaching towards the sky. 
The camera glides smoothly from the base of the hill upwards, showcasing the imposing entrance gate and the detailed masonry of the structure. As the camera continues its ascent, it reveals flags fluttering gently in the breeze atop the towers. The setting sun bathes the scene in a golden light, adding a magical and enchanting ambiance to the castle's silhouette." + }, + { + "prompt_en": "cemetery", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "cemetery" + } + } + }, + "refined_prompt": "A serene scene unfolds in a cemetery, where rows of tombstones stand silently amid lush green grass. The sun casts a gentle, golden light over the area, creating a peaceful and respectful atmosphere. The camera slowly moves forward, capturing the intricate engravings on the closest tombstones. A light breeze sways a nearby tree, its leaves rustling softly. In the background, a small bouquet of flowers lies beside one of the graves, adding a touch of color to the monochrome landscape." + }, + { + "prompt_en": "classroom", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "classroom" + } + } + }, + "refined_prompt": "The scene depicts a brightly lit classroom with neat rows of desks and chairs. The walls are adorned with colorful educational posters and a large whiteboard at the front of the room. Sunlight streams through the large windows, casting a warm glow over the classroom. The camera pans slowly from left to right, capturing the organized desks and the inviting, cheerful atmosphere." + }, + { + "prompt_en": "cliff", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "cliff" + } + } + }, + "refined_prompt": "A towering cliff made of rugged stone rises majestically against a background of a clear blue sky. The cliff's surface is textured with natural cracks and crevices, and sparse vegetation clings to the rock in some places. The sun casts dramatic shadows across the face of the cliff, highlighting its jagged edges and bold contours. The camera slowly pans upward, capturing the impressive height and structure of the cliff, creating a sense of awe and grandeur." + }, + { + "prompt_en": "crosswalk", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "crosswalk" + } + } + }, + "refined_prompt": "A bustling crosswalk in the city comes into view, with several people walking across in various directions. The camera focuses on a charming woman with a confident stride as she makes her way across. She has long, flowing hair and is wearing a stylish trench coat with fashionable ankle boots. Her eyes are focused ahead, and she carries a small handbag. Vehicles are stopped at the traffic light, and the city buildings rise in the background, creating an urban atmosphere. The camera smoothly follows her movement, enhancing the dynamic scene." + }, + { + "prompt_en": "construction site", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "construction site" + } + } + }, + "refined_prompt": "In an active construction site, a large yellow excavator is moving earth from a pile. The machine has a long arm with a bucket scooping soil and rocks, then rotating to deposit the material into a dump truck nearby.
The site is bustling with activity, and the heavy machinery operates under the bright midday sun, casting shadows across the dirt. The camera smoothly pans from left to right, capturing the excavator's precise movements against the backdrop of partially constructed buildings and cranes looming in the distance." + }, + { + "prompt_en": "corridor", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "corridor" + } + } + }, + "refined_prompt": "A long, empty corridor stretches ahead, illuminated by overhead fluorescent lights that cast a bright, consistent glow along the space. The corridor has smooth, white walls and a polished, reflective floor. The camera slowly moves forward down the corridor, giving a sense of depth and perspective as doors on each side gradually come into view. The quiet, tranquil atmosphere adds a sense of calm and order to the scene." + }, + { + "prompt_en": "courtyard", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "courtyard" + } + } + }, + "refined_prompt": "A serene courtyard is presented, with a stone pathway leading through lush greenery. The courtyard is framed by tall, leafy plants and vibrant flowers, creating a peaceful and inviting atmosphere. Sunlight filters through the foliage, casting gentle shadows on the pathway. In the center of the courtyard, a small fountain bubbles softly, its water sparkling in the sunlight. The camera moves slowly from one side to the other, capturing the tranquility and natural beauty of the space." + }, + { + "prompt_en": "desert", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "desert" + } + } + }, + "refined_prompt": "The video captures a vast desert landscape under the clear blue sky. The camera begins with a wide shot of rolling sand dunes stretching out in every direction. The sun casts sharp shadows on the dunes, accentuating their graceful curves. As the camera tilts downward, a single cactus comes into view, standing solitary amid the sands. The gentle desert breeze causes the sand to shift slightly around the cactus, adding a subtle sense of motion to the serene and arid atmosphere." + }, + { + "prompt_en": "downtown", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "downtown" + } + } + }, + "refined_prompt": "A bustling downtown scene is captured with tall skyscrapers and buildings stretching into the sky. The foreground features a busy street filled with moving cars and pedestrians crossing at intersections. The buildings have reflective glass windows, showcasing the vibrant energy of the city. The atmosphere is lively, with bright digital advertisements and neon signs illuminating the area. The camera slowly pans across the street, highlighting the variety of architecture and the diverse activity of city life." + }, + { + "prompt_en": "driveway", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "driveway" + } + } + }, + "refined_prompt": "A neatly paved driveway stretches out in front of a charming suburban home. The driveway, composed of smooth gray concrete, is lined with well-manicured grass and vibrant flower beds. Sunlight casts soft shadows from nearby trees, enhancing the tranquil atmosphere. 
A sleek black sedan is parked halfway down the driveway, its polished surface reflecting the surrounding greenery. The camera slowly moves forward, capturing the inviting entrance of the home and the overall peaceful setting." + }, + { + "prompt_en": "farm", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "farm" + } + } + }, + "refined_prompt": "The scene captures a peaceful farm landscape with a wooden barn nestled among rolling green fields. A tractor is parked near the barn, and a few chickens can be seen pecking at the ground. The sun is setting in the distance, creating a golden hue that blankets the entire farm with a warm glow. A gentle breeze causes the nearby trees to sway slightly. The camera slowly pans across the scene, capturing the tranquility and rustic charm of the farm setting." + }, + { + "prompt_en": "food court", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "food court" + } + } + }, + "refined_prompt": "Inside a bustling food court, various food stalls line the perimeter, each showcasing vibrant signage and enticing displays of food. People of diverse appearances are seen walking around, holding trays filled with colorful assortments of food and beverages. In the center, a stylish man with sharp features and well-groomed hair wearing a casual button-up shirt navigates through the crowd carrying a tray. The atmosphere is lively and vibrant with people talking and enjoying their meals. Bright, overhead lighting illuminates the entire area, highlighting the variety of dishes and the energy of the crowd. The camera smoothly pans around the food court, capturing glimpses of the different stalls and the interactions between the people." + }, + { + "prompt_en": "football field", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "football field" + } + } + }, + "refined_prompt": "A vast football field is captured from a high angle, showcasing the vibrant green grass and the white boundary lines that distinctly mark the playing area. The camera moves smoothly across the field with no players in view, emphasizing the emptiness and tranquility. The goal posts stand prominently at each end, while the bright sunlight casts shadows on the field, creating a serene and inviting atmosphere. The stadium seats are partially visible in the distance, completing the scene." + }, + { + "prompt_en": "forest road", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "forest road" + } + } + }, + "refined_prompt": "A scenic forest road winds through a dense canopy of tall, leafy green trees. Sunlight filters through the leaves, casting dappled patterns of light and shadow on the road. The narrow, winding path stretches ahead, inviting a sense of tranquility and adventure. The camera gently pans along the road, revealing the lush greenery on both sides while highlighting the peaceful ambiance of the forest setting." + }, + { + "prompt_en": "fountain", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "fountain" + } + } + }, + "refined_prompt": "A beautifully detailed stone fountain stands in a serene garden setting. Water cascades gracefully from the top tier, creating a gentle stream that splashes into the lower basin. 
The sun casts a soft, shimmering reflection on the water, enhancing the tranquil atmosphere. Surrounding the fountain are vibrant green plants and colorful flowers, adding to the peaceful ambiance. The camera slowly circles the fountain, capturing its intricate design and the harmonious flow of water." + }, + { + "prompt_en": "gas station", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "gas station" + } + } + }, + "refined_prompt": "A gas station with bright overhead lights illuminating the area comes into view. There are a couple of fuel pumps with digital displays. A dark-colored car is parked next to one of the pumps, and a person wearing a jacket and jeans is standing beside the vehicle, holding a fuel nozzle to refuel the car. In the background, the convenience store attached to the gas station can be partially seen, with brightly lit windows displaying various signs and products. The image captures a bustling yet organized atmosphere typical of a busy gas station." + }, + { + "prompt_en": "glacier", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "glacier" + } + } + }, + "refined_prompt": "A majestic glacier stretches across the landscape, its ice shimmering under the bright sunlight. The camera captures the massive, blue-tinged ice formations with jagged peaks and crevices carving through its surface. As the camera glides over the glacier, the intricate details of the ice texture become visible, highlighting the natural beauty and vastness of the icy expanse. The surrounding atmosphere is serene, with a clear, crisp sky providing a striking backdrop to the glacier's brilliance. " + }, + { + "prompt_en": "golf course", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "golf course" + } + } + }, + "refined_prompt": "A panoramic view of a lush golf course is presented under clear blue skies. Rolling fairways stretch across the scene, interspersed with patches of vibrant green grass and strategically placed sand bunkers. In the foreground, a golfer, a handsome man with a confident demeanor, stands poised to take a swing. He wears a fitted polo shirt and tailored shorts, with a visor shielding his eyes from the sun. The camera gently pans across the landscape, capturing the manicured greens, pristine bunkers, and distant tree-lined horizon, creating a serene and picturesque atmosphere." + }, + { + "prompt_en": "indoor gymnasium", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "indoor gymnasium" + } + } + }, + "refined_prompt": "Inside an indoor gymnasium, the video captures a spacious area with polished wooden floors and high ceilings. To the left, a series of basketball hoops hang from the ceiling, and in the background, a vibrant mural decorates one of the walls. A row of bright overhead lights illuminates the entire space, creating a lively and energetic atmosphere. The camera slowly pans from left to right, capturing the full expanse of the gymnasium, including a few scattered gym mats and benches along the sides." + }, + { + "prompt_en": "harbor", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "harbor" + } + } + }, + "refined_prompt": "A picturesque view of a harbor comes into focus. 
The scene is set under a clear blue sky, with several boats gently bobbing in the water. The sunlight reflects off the water's surface, creating a shimmering effect. In the background, a row of small, charming buildings lines the waterfront, and a few seagulls can be seen gliding overhead. The camera pans slowly from left to right, capturing the tranquil and serene ambiance of the harbor." + }, + { + "prompt_en": "highway", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "highway" + } + } + }, + "refined_prompt": "The video opens with a sweeping aerial view of a busy highway. Cars and trucks are seen moving rapidly in both directions, their headlights and taillights creating a streaking effect against the asphalt. The scene captures the constant motion and energy of the highway environment. The lighting is bright and clear, suggesting daytime, and the surrounding landscape includes some greenery and distant buildings. The camera smoothly follows the flow of traffic, highlighting the vastness and liveliness of the scene." + }, + { + "prompt_en": "hospital", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "hospital" + } + } + }, + "refined_prompt": "A bright, modern hospital corridor is depicted, featuring a row of doors leading to various rooms on each side. The corridor is illuminated by overhead fluorescent lights, creating a clean and sterile atmosphere. The floor is polished, reflecting the light and emphasizing the immaculate surroundings. In the middle of the corridor, two medical carts are parked, each containing an assortment of medical supplies. The scene is calm and organized, with no people visible, allowing the focus to remain on the details of the hospital environment. The camera smoothly moves forward through the corridor, capturing the serene and efficient ambiance of the space." + }, + { + "prompt_en": "house", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "house" + } + } + }, + "refined_prompt": "A charming house stands enveloped in lush greenery, with its exterior featuring a combination of brickwork and pastel-colored wooden siding. The front facade is adorned with large windows that reflect the soft glow of the afternoon sun. An inviting porch stretches across the front, complete with a pair of rocking chairs and potted plants. The camera slowly pans from left to right, capturing the neat garden with its neatly trimmed hedges and vibrant flowers, enhancing the serene and picturesque atmosphere of the scene." + }, + { + "prompt_en": "iceberg", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "iceberg" + } + } + }, + "refined_prompt": "A large, majestic iceberg floats serenely in the open sea under a clear blue sky. The iceberg is massive, with jagged peaks and a surface that glistens in the sun. The surrounding water is a deep blue, contrasting sharply with the icy white and light blue hues of the iceberg. The camera slowly pans to the right, capturing the iceberg's impressive size and intricate shapes as it drifts gently away from the camera. The atmosphere is crisp and tranquil, with the gentle sound of the waves lapping against the iceberg." 
+ }, + { + "prompt_en": "industrial area", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "industrial area" + } + } + }, + "refined_prompt": "An industrial area is seen from a high-angle view, showcasing a collection of large warehouses and factory buildings painted in a mix of gray and blue. Smoke slowly rises from several tall chimneys, merging into the overcast sky, creating a slightly gloomy atmosphere. The camera pans gently across the rooftops, revealing stacks of shipping containers lining the perimeter of the area. The industrial landscape is dotted with cranes and machinery, highlighting the bustling nature of the scene." + }, + { + "prompt_en": "jail cell", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "jail cell" + } + } + }, + "refined_prompt": "The scene captures the inside of a dimly lit jail cell, with shadows cast across the worn concrete walls. In the center, a simple metal bed with a thin mattress sits against one wall, with a gray wool blanket neatly folded at the foot. A small metal sink and toilet unit are visible in the corner, under a faint flickering light. The barred window high on the opposite wall lets in a sliver of light, creating a somber, confined atmosphere. The camera slowly pans across the cell, emphasizing its stark, cold environment." + }, + { + "prompt_en": "junkyard", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "junkyard" + } + } + }, + "refined_prompt": "A wide-angle view captures the sprawling expanse of a junkyard, filled with a diverse array of discarded items and vehicles. Rusted car bodies are stacked haphazardly, while piles of metal scrap glint in the sunlight. The ground is uneven with patches of dirt and scattered debris. The atmosphere is rugged and gritty, with the sun casting long shadows across the scene. As the camera slowly pans across the junkyard, details of crushed metal, old tires, and broken machinery come into focus, highlighting the textures and colors of the reclaimed materials." + }, + { + "prompt_en": "kitchen", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "kitchen" + } + } + }, + "refined_prompt": "A modern kitchen is depicted with sleek, white cabinetry and smooth countertops. A silver faucet and sink are positioned on the countertop, with a large window above them allowing natural light to stream in, creating a bright and inviting atmosphere. On the counter, there is a bowl of fresh vegetables, including tomatoes and lettuce, adding a pop of color to the scene. To the side, a stainless steel refrigerator stands, blending seamlessly with the contemporary design. The camera pans slowly from left to right, capturing the clean and organized space." + }, + { + "prompt_en": "indoor library", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "indoor library" + } + } + }, + "refined_prompt": "The scene opens inside a library filled with tall bookshelves that reach almost to the ceiling. These shelves are packed with an array of books, all organized neatly. The warm, inviting lighting highlights the wood tones of the shelves and the cozy atmosphere of the room. Near the center, a narrow wooden table is seen with a few scattered open books and a laptop. 
The camera slowly pans across the shelves, focusing on the spines of the books, before circling around to give a full view of the quiet, serene environment within the library." + }, + { + "prompt_en": "lighthouse", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "lighthouse" + } + } + }, + "refined_prompt": "A tall, classic lighthouse stands majestically against the backdrop of a clear blue sky. The camera begins with a wide shot, capturing the white tower with red accents and the surrounding rocky shoreline. As the camera gradually moves in closer, the details of the lighthouse become more pronounced, revealing its windows and the contrasting red top where the beacon is housed. Sunlight reflects against the structure, highlighting its white paint. The camera subtly pans around the lighthouse, offering a view of the sparkling ocean in the distance." + }, + { + "prompt_en": "laboratory", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "laboratory" + } + } + }, + "refined_prompt": "The scene captures a modern laboratory filled with scientific equipment. A sleek, stainless steel workbench is prominently positioned in the center, featuring various instruments such as test tubes, beakers, and digital displays. Bright overhead lighting illuminates the clean, organized space, highlighting the precision and order of the lab. The camera gently pans from left to right, showcasing additional shelves lined with labeled chemical bottles and assorted lab tools. The entire setting conveys an atmosphere of focus and innovation within the scientific research environment." + }, + { + "prompt_en": "mansion", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "mansion" + } + } + }, + "refined_prompt": "The camera presents a view of a grand mansion surrounded by beautifully manicured gardens. The stately mansion features an elegant facade with large, arched windows and a grand entrance framed by tall columns. The warm lighting from the interior softly illuminates the exterior, creating a welcoming ambiance. As the camera slowly moves closer, details like intricate stonework and a lush green lawn become apparent, enhancing the opulence of the scene." + }, + { + "prompt_en": "marsh", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "marsh" + } + } + }, + "refined_prompt": "A marsh landscape is depicted, with patches of green grass and reeds interspersed throughout the wet, muddy ground. The scene captures the serene, natural beauty of the area, with the sunlight filtering through the clouds and casting a soft, diffused light. Small pools of water are scattered among the vegetation, reflecting the sky above. The camera gently pans across the marsh, creating a sense of calm and tranquility while highlighting the varied textures and colors of this wetlands environment." + }, + { + "prompt_en": "mountain", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "mountain" + } + } + }, + "refined_prompt": "The camera captures a majestic mountain towering against a clear, bright blue sky. The sun bathes the rugged slopes in warm, golden light, highlighting the peaks and rugged surfaces. Snow caps the mountain's highest points, glistening in the sunlight. 
The camera gently pans from the base of the mountain upwards, revealing more details of the rocky terrain and sparse patches of greenery. The atmosphere is tranquil and awe-inspiring, with a gentle breeze suggested by the movement of wispy clouds in the sky." + }, + { + "prompt_en": "indoor movie theater", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "indoor movie theater" + } + } + }, + "refined_prompt": "Inside an indoor movie theater, plush red seats are arranged in rows, facing a large, blank movie screen. The theater is dimly lit, with subtle overhead lights casting a soft glow. The camera pans slowly from the front of the auditorium towards the back, capturing the textured walls and shadowy ambiance. The scene conveys a serene and quiet atmosphere before the movie begins. The camera finally pauses with a focus on the middle row, offering a cozy view of the theater's inviting environment." + }, + { + "prompt_en": "indoor museum", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "indoor museum" + } + } + }, + "refined_prompt": "In an indoor museum gallery, a variety of artifacts are on display. The room is bathed in soft, ambient lighting that highlights the intricate details of the exhibits. One prominent exhibit consists of a series of ancient pottery items, each resting on a pedestal with a descriptive plaque. The walls are adorned with framed paintings of historical scenes, adding a touch of art to the space. The camera gently pans across the room, capturing the elegance and sophistication of the museum's ensemble, while the polished wooden floors reflect the surrounding exhibits." + }, + { + "prompt_en": "music studio", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "music studio" + } + } + }, + "refined_prompt": "In a music studio filled with high-tech equipment, a polished wooden console desk dominates the scene. The desk is adorned with an array of mixers, glowing LED buttons, and a computer monitor displaying an audio editing software interface. Large speakers are positioned on either side of the desk. The studio ambiance is softly lit, with a subtle blue hue illuminating the soundproof walls. The camera pans slowly from left to right, capturing the precision and modernity of the setup, as a gentle hum of background sound resonates through the room." + }, + { + "prompt_en": "nursery", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "nursery" + } + } + }, + "refined_prompt": "A cozy and brightly lit nursery room features a baby crib with soft pastel-colored bedding on the left side. Delicate floral patterns adorn the crib's bedding, creating a warm and inviting atmosphere. On the wall behind the crib, a mobile with charming plush animals is hanging, slowly rotating. To the right, a tall shelf displays an array of colorful children's books and stuffed animals. The window lets in gentle, natural light that illuminates the space, enhancing its tranquil and nurturing ambiance. The camera pans slowly across the room, capturing the serene details." 
+ }, + { + "prompt_en": "ocean", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "ocean" + } + } + }, + "refined_prompt": "The camera offers a sweeping view of the vast ocean with gentle waves rippling across the water's surface. The sun is low in the sky, casting a golden hue over the scene and causing the water to shimmer. As the camera glides slightly forward, the horizon stretches endlessly, giving a sense of tranquility and expansiveness. Seagulls occasionally soar across the view, adding life to the serene atmosphere." + }, + { + "prompt_en": "office", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "office" + } + } + }, + "refined_prompt": "The scene is set in a modern office with sleek furnishings and large windows that let in ample natural light. A stylish desk is positioned near the window, with a computer monitor, a stack of papers, and a potted plant neatly arranged on it. A sophisticated office chair is pushed slightly away from the desk. The atmosphere is professional and organized, creating a sense of productivity and focus. The camera pans slowly from right to left, capturing the layout of the office and its contemporary decor." + }, + { + "prompt_en": "palace", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "palace" + } + } + }, + "refined_prompt": "The camera smoothly pans across a grand palace against a backdrop of a clear blue sky. The palace features elaborate architecture with tall spires, intricate carvings, and ornate balconies. Its facade is a blend of white stone and golden accents, glistening in the sunlight. Majestic columns support the entrance, and lush green gardens can be glimpsed at the palace's perimeter. The scene exudes an atmosphere of regal elegance, with the camera capturing the palace's grandeur from a slightly elevated angle." + }, + { + "prompt_en": "parking lot", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "parking lot" + } + } + }, + "refined_prompt": "A parking lot filled with a variety of vehicles, including cars and a few larger trucks, is seen from a slightly elevated angle. Bright sunlight casts clear shadows of the vehicles on the ground, and the painted lines marking parking spaces are visible on the asphalt. The camera slowly pans from left to right, showcasing the arrangement and diversity of vehicles parked in the lot. A gentle breeze causes nearby trees to sway slightly in the background, adding a sense of calm to the busy setting." + }, + { + "prompt_en": "pharmacy", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "pharmacy" + } + } + }, + "refined_prompt": "Inside a brightly lit pharmacy, rows of neatly organized shelves filled with various medications and health products stretch into the distance. The camera smoothly pans down one of the aisles, capturing a diverse range of products with colorful packaging. The atmosphere is clean and orderly, with a soft ambient light creating a welcoming environment. A cheerful male pharmacist in a white coat stands at the counter near the aisles, ready to assist customers, contributing to a professional and approachable atmosphere." 
+ }, + { + "prompt_en": "phone booth", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "phone booth" + } + } + }, + "refined_prompt": "A classic red British phone booth stands on a cobblestone street, surrounded by charming historic buildings. The booth's glossy paint shines in the soft afternoon light. Its iconic glass panels reflect the surrounding architecture, and the black \"TELEPHONE\" inscription stands out prominently at the top. The camera slowly zooms in, capturing small details like the brass door handle and the interior rotary dial, while the atmospheric lighting adds a nostalgic touch to the scene." + }, + { + "prompt_en": "raceway", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "raceway" + } + } + }, + "refined_prompt": "A wide view of a raceway stretches into the distance, surrounded by rolling hills and lush greenery. The sun casts a bright, clear light on the asphalt track, reflecting off its surface. Several colorful race cars speed down the track, leaving behind a trail of shimmering heat waves. The roar of engines fills the air, adding to the exhilarating atmosphere. The camera pans smoothly along the track, capturing the intensity and speed of the race, while a banner with sponsor logos flutters in the breeze above the grandstands." + }, + { + "prompt_en": "restaurant", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "restaurant" + } + } + }, + "refined_prompt": "The scene captures a cozy restaurant interior, softly lit with warm, ambient lighting to create a welcoming atmosphere. In the foreground, a beautifully set table is arranged with elegant dinnerware, including white plates, polished silverware, and folded napkins. A small vase with fresh flowers adds a touch of charm to the table. In the background, a row of upholstered chairs lines the table, and a large window reveals a soft glow from outside, hinting at dusk. The overall setting suggests a warm, inviting ambiance perfect for a comfortable dining experience." + }, + { + "prompt_en": "river", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "river" + } + } + }, + "refined_prompt": "A scenic view of a river gently flowing through a lush, green landscape. The sunlight shimmers on the water's surface, creating a sparkling effect. On the riverbanks, tall grasses and a few wildflowers sway lightly in the breeze. The camera pans slowly along the river, capturing the peaceful movement of the water and the natural beauty of the surroundings." + }, + { + "prompt_en": "science museum", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "science museum" + } + } + }, + "refined_prompt": "Inside a science museum, the camera gently pans across an engaging exhibit. In the foreground, a large, interactive display showcases a glowing model of the solar system. Planets of various sizes orbit around a radiant sun, with each one differing in color and size, suspended from thin wires to create a floating effect. In the background, detailed informational panels provide facts about the planets and the universe. The ambient lighting is dim, emphasizing the illuminated features of the exhibit and creating an atmosphere filled with curiosity and discovery. 
The camera subtly moves forward, focusing on the rotating planets, highlighting the awe-inspiring design." + }, + { + "prompt_en": "shower", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "shower" + } + } + }, + "refined_prompt": "In a modern bathroom, water cascades from a sleek, stainless steel showerhead mounted on the wall. The shower stream creates a soft mist as it hits the glass door, which is slightly fogged up from the steam. Warm light from above illuminates the scene, casting subtle reflections on the tiled walls and floor. The shower area exudes a clean and contemporary feel. The camera slowly zooms in on the water droplets collecting and sliding down the glass, emphasizing the serene and refreshing atmosphere." + }, + { + "prompt_en": "ski slope", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "ski slope" + } + } + }, + "refined_prompt": "A pristine ski slope covered in glistening snow stretches out ahead, with tall, snow-dusted pine trees lining the edges. The golden morning sunlight casts long shadows across the surface, emphasizing the smooth tracks made by previous skiers. In the distance, majestic mountain peaks rise against a clear blue sky, completing the breathtaking winter scene. The camera moves smoothly down the slope, capturing the serene and exhilarating atmosphere of this snowy landscape." + }, + { + "prompt_en": "sky", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "sky" + } + } + }, + "refined_prompt": "The scene captures a clear and expansive sky, stretching infinitely across the horizon. The camera tilts upwards to reveal the deep blue canvas, dotted with a few fluffy white clouds drifting peacefully. The sunlight bathes the sky in a warm glow, creating a serene and tranquil atmosphere. As the camera gently pans, the subtle variations in color and cloud formations become more apparent, enhancing the beauty and depth of the sky." + }, + { + "prompt_en": "skyscraper", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "skyscraper" + } + } + }, + "refined_prompt": "The camera slowly pans upward, capturing the sleek and modern architecture of a towering skyscraper. The building's reflective glass facade mirrors the bright blue sky and fluffy white clouds above. As the camera continues to ascend, the intricate details of the building's exterior become more pronounced, revealing rows of large windows and a series of elegant, geometric patterns. The sunlight glints off the glass, highlighting the height and grandeur of the structure. The atmosphere feels vibrant and bustling, suggesting the energy of a thriving urban environment." + }, + { + "prompt_en": "baseball stadium", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "baseball stadium" + } + } + }, + "refined_prompt": "The camera pans over a baseball stadium on a sunny day, revealing a lush green field below. The stands are filled with colorful seats, although they are currently empty. A scoreboard is visible in the distance, displaying the team's logos and scores. Towering lights are positioned around the stadium, ready to illuminate the field for evening games. 
The atmosphere is calm yet anticipatory, as if the stadium is awaiting the arrival of enthusiastic fans and the start of an exciting game." + }, + { + "prompt_en": "staircase", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "staircase" + } + } + }, + "refined_prompt": "A grand wooden staircase is captured from a dramatic low-angle perspective, emphasizing its elegant and sweeping design. The staircase features polished wooden steps and a beautifully crafted railing adorned with intricate balusters. Soft, ambient lighting highlights the rich wood tones, creating a warm and inviting atmosphere. As the camera gently pans upward, the staircase ascends gracefully, drawing the viewer's eye to the levels above." + }, + { + "prompt_en": "street", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "street" + } + } + }, + "refined_prompt": "A narrow cobblestone street lined with charming, old-world buildings on both sides comes into view. The camera glides smoothly down the street, capturing the warm glow of streetlights casting soft pools of light on the cobblestones. Balconies adorned with potted plants and flowers add a touch of color and nature to the scene. A gentle breeze rustles the leaves of nearby trees. The street is peaceful and quiet, evoking a sense of nostalgia and history as the camera moves forward." + }, + { + "prompt_en": "supermarket", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "supermarket" + } + } + }, + "refined_prompt": "The video captures the interior of a bustling supermarket aisle filled with neatly stacked shelves of colorful products. Bright fluorescent lighting illuminates the scene, creating a vibrant and energetic atmosphere. The camera glides smoothly along the aisle, revealing a diverse selection of items such as cereal boxes, canned goods, and snacks. Shoppers are visible in the distance, moving about as they browse the items. The orderly arrangement of products and the lively yet organized environment reflect the supermarket's welcoming appeal." + }, + { + "prompt_en": "indoor swimming pool", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "indoor swimming pool" + } + } + }, + "refined_prompt": "An elegant indoor swimming pool features calm blue water, reflecting atmospheric lighting from overhead fixtures. The pool is surrounded by sleek tiled flooring and cushioned lounge chairs positioned along the edge. Large windows line one side of the room, allowing a soft, natural light to filter in, contrasting with the artificial lighting above. The serene setting creates a tranquil and inviting atmosphere, inviting viewers to imagine the calmness of a swim. The camera smoothly pans across the pool's surface, capturing the subtle ripples and the intricate details of the poolside ambiance." + }, + { + "prompt_en": "tower", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "tower" + } + } + }, + "refined_prompt": "A tall, stone tower stands against a clear blue sky, its weathered facade giving it an air of historical significance. The tower is cylindrical in shape with narrow windows spiraling upwards. Sunlight reflects off the tower, highlighting its textures and intricate architectural details. 
The base of the tower is surrounded by lush green grass and a few scattered trees. The camera starts at the base and smoothly pans upwards, capturing the full height of the tower against the expansive sky." + }, + { + "prompt_en": "outdoor track", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "outdoor track" + } + } + }, + "refined_prompt": "A vibrant outdoor track scene is captured with the camera initially focusing on the bright red lanes, which contrast against the lush green grass surrounding the track. The camera pans to reveal several track lanes curving around, bordered by white lines. In the background, a few trees sway gently in the breeze under a clear blue sky. The lighting is natural and bright, highlighting the distinct colors and giving a lively and energetic atmosphere to the track environment." + }, + { + "prompt_en": "train railway", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "train railway" + } + } + }, + "refined_prompt": "A train railway stretches into the distance, with parallel tracks running through a lush, green landscape. The metallic rails glisten in the sunlight, and wooden sleepers are evenly spaced beneath them. The camera pans slowly down the length of the railway, capturing the natural beauty of the surrounding countryside. Tall trees line the sides of the tracks, their leaves rustling gently in the breeze. The scene is calm and serene, with only the sound of nature accompanying the view." + }, + { + "prompt_en": "train station platform", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "train station platform" + } + } + }, + "refined_prompt": "The video showcases a bustling train station platform during the day. The platform is lined with tracks on one side, while on the other side, there are signs and benches for passengers. A sleek, modern train is stationary, its doors open for boarding. The light shines down from overhead, illuminating the platform and highlighting the people boarding the train. Commuters hurry along the platform, some carrying briefcases and backpacks, while others pull luggage behind them. The atmosphere feels busy and dynamic, capturing the essence of travel and movement. The camera smoothly pans across the scene, capturing both the train and the buzz of activity on the platform." + }, + { + "prompt_en": "underwater coral reef", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "underwater coral reef" + } + } + }, + "refined_prompt": "The camera dives beneath the ocean's surface to reveal a vibrant underwater coral reef. The reef is teeming with colorful corals in shades of purple, orange, and pink, forming intricate and mesmerizing patterns. Small fish dart around playfully, weaving through the coral formations. Shafts of sunlight penetrate the clear water, creating a beautiful, dappled effect on the reef. The camera glides smoothly over the scene, showcasing the diverse marine life and the stunning details of the coral structures." + }, + { + "prompt_en": "valley", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "valley" + } + } + }, + "refined_prompt": "An expansive view of a lush valley unfolds before the viewer, with rolling green hills stretching into the distance. 
The camera smoothly pans across the landscape, revealing a river meandering through the valley floor, reflecting the bright morning sunlight. Trees dot the landscape, their leaves rustling in a gentle breeze, as fluffy white clouds drift lazily across the blue sky, enhancing the serene and picturesque atmosphere." + }, + { + "prompt_en": "volcano", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "volcano" + } + } + }, + "refined_prompt": "An impressive volcano stands majestically against a clear blue sky. The camera captures the view from a distance, slowly zooming in to reveal the distinct, rugged surface of the mountain. Smoke subtly rises from the volcano's crater, drifting upwards and dispersing into the air. The sunlight casts a warm glow over the scene, highlighting the earthy tones of the volcanic rock. The surrounding landscape features a mix of green vegetation and rocky terrain, adding depth to the atmosphere. The camera gently pulls back, providing a full view of the volcano amidst its natural surroundings." + }, + { + "prompt_en": "waterfall", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "waterfall" + } + } + }, + "refined_prompt": "A breathtaking waterfall cascades down a rocky cliff surrounded by lush greenery. The water flows powerfully, creating a misty spray at the base as it plunges into a clear pool below. Sunlight filters through the trees, casting dappled light and creating small rainbows in the mist. The sound of rushing water and the serene atmosphere add to the overall beauty of the scene. The camera starts by focusing on the top of the waterfall and slowly moves down to reveal the entire cascade and the pool below." + }, + { + "prompt_en": "windmill", + "dimension": [ + "scene", + "background_consistency" + ], + "auxiliary_info": { + "scene": { + "scene": { + "scene": "windmill" + } + } + }, + "refined_prompt": "A picturesque windmill stands against a clear blue sky, its large blades slowly turning in a gentle breeze. The structure is set on a lush green landscape, with wildflowers dotting the ground. The sun casts warm light, highlighting the wooden texture of the windmill's tower. The camera gently pans around the windmill, capturing the peaceful and serene ambiance of the countryside." + }, + { + "prompt_en": "a bicycle on the left of a car, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "bicycle", + "object_b": "car", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In this scene, a bicycle is positioned to the left of a sleek car, both viewed from the front. The bicycle has a metallic frame and features a shiny chrome finish, with handlebars and a basket. The car next to it is a stylish model with a glossy black exterior. The bicycle's wheels are perfectly aligned with the car's front left wheel. The background is a calm street setting, with the sun illuminating both vehicles, giving them a subtle reflective shine. The camera captures this scene from a slightly low angle to emphasize the harmonious positioning of the bicycle and the car." 
+ }, + { + "prompt_en": "a car on the right of a motorcycle, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "car", + "object_b": "motorcycle", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In an engaging front view scene, a sleek red car is positioned to the right of a stylish motorcycle on a road. The car's polished exterior gleams in the daylight, with its headlights bright and focused forward. The motorcycle, with its gleaming chrome and elegant design, adds contrast and appeal as its rider sits with a poised posture. The sky overhead is clear, and the camera maintains a steady angle, capturing both vehicles side by side as they are poised for action. The road appears smooth and inviting, stretching into the distance." + }, + { + "prompt_en": "a motorcycle on the left of a bus, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "motorcycle", + "object_b": "bus", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In the foreground of the scene, there is a striking motorcycle parked on the left side of a large bus. The camera captures a front view of both vehicles. The motorcycle features a sleek design with a shiny metallic finish, prominent handlebars, and a sturdy frame. The bus, with its expansive front windshield and vibrant exterior, towers beside the motorcycle. The scene is set against a bright clear sky, creating a stark contrast between the powerful motorcycle and the robust bus. The camera gently zooms in, highlighting the details of the motorcycle and the bus side by side." + }, + { + "prompt_en": "a bus on the right of a traffic light, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "bus", + "object_b": "traffic light", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "A large bus is situated on the right side of a traffic light, with its front facing towards the camera. The bus is painted a bright color, featuring a sleek design with large windows at the front. The traffic light is mounted on a pole, showing red, and positioned to the left of the frame. The scene is set during the day, with clear skies and vibrant lighting, casting reflections on the bus's windows. The camera remains steady, capturing the front of the bus and the traffic light in a balanced composition." + }, + { + "prompt_en": "a traffic light on the left of a fire hydrant, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "traffic light", + "object_b": "fire hydrant", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "The scene captures a front view of a traffic light positioned on the left side of a bright red fire hydrant. The traffic light displays a green light, signaling go. The background shows a blurred urban setting with hints of buildings and a street. The lighting is bright, suggesting a sunny day, which casts sharp shadows on the ground. The camera remains steady, focusing on these two objects as part of a cityscape." 
+ }, + { + "prompt_en": "a fire hydrant on the right of a stop sign, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "fire hydrant", + "object_b": "stop sign", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In the forefront, a vibrant red fire hydrant stands prominently on the right side of the scene. Just to its left, a stop sign is clearly visible, positioned at equal height. The fire hydrant's glossy surface reflects the ambient light, adding an extra dimension to its color. The stop sign displays bold white letters against its bright red background, set against a clear blue sky. The camera angle provides a straight-on front view, creating a balanced and straightforward composition. Subtle shadows are cast by both the hydrant and the sign, enhancing the sense of depth in the scene." + }, + { + "prompt_en": "a stop sign on the left of a parking meter, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "stop sign", + "object_b": "parking meter", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a front-view scene, a stop sign is prominently displayed on the left side, with its red, octagonal shape and bold white lettering. To the right of the sign stands a modern parking meter, sleek and metallic, with a digital display and card slot visible. The scene is set outdoors, and the lighting suggests a clear day, casting gentle shadows on the ground. The camera is steady, capturing both objects clearly in the frame against a neutral background." + }, + { + "prompt_en": "a parking meter on the right of a bench, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "parking meter", + "object_b": "bench", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "A parking meter stands prominently on the right side of a wooden bench in a quaint urban setting. The parking meter is painted in a sleek metallic silver with a clear display panel at the top. The bench, made of dark wood with metal armrests, contrasts nicely with the meter. The scene is illuminated by the warm glow of street lighting, casting soft shadows beneath the bench and parking meter. The camera angle is from the front, capturing both objects clearly against the backdrop of a cobblestone pathway." + }, + { + "prompt_en": "a bench on the left of a truck, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "bench", + "object_b": "truck", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a serene outdoor setting, a wooden bench is positioned on the left side of a large, white truck. The bench, made of dark wood, has a simple yet elegant design. The truck, seen from the front view, is parked on a grassy area, and its polished surface gleams in the sunlight. The scene exudes a peaceful atmosphere, with the quiet environment accentuated by the gentle movement of leaves in a light breeze. The camera captures this moment from a steady angle, focusing on the bench and truck with a slight tilt to emphasize their placement and highlight the tranquil setting." 
+ }, + { + "prompt_en": "a truck on the right of a bicycle, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "truck", + "object_b": "bicycle", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "From a front view, a large truck is seen on the right side of a bicycle. The truck has a sleek, polished exterior with a chrome grille and bright headlights. The bicycle to its left is a classic road bike with a slender frame and thin tires. The cyclist is a fit man, wearing a bright yellow cycling jersey and black shorts, looking focused as he pedals forward. The road stretches ahead, and the sunlight casts crisp shadows, enhancing the contrast between the truck and the bicycle. The camera is positioned to capture both the vehicles evenly, offering a balanced view of the scene." + }, + { + "prompt_en": "a bird on the left of a cat, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "bird", + "object_b": "cat", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a vibrant garden setting, a beautiful bird perches gracefully to the left of a cat. The bird, vibrant with colorful feathers, peers curiously as it sits still. The cat, with sleek fur and bright, attentive eyes, gazes directly forward from where it sits on soft green grass. The sunlight filters through the leaves, casting dappled shadows, enhancing the serene and picturesque scene. The camera, capturing a front view, slowly pans to showcase both the bird and the cat in harmony." + }, + { + "prompt_en": "a cat on the right of a dog, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "cat", + "object_b": "dog", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In a cozy indoor setting, a charming cat and an adorable dog sit side by side, facing the camera. The cat, positioned to the right of the dog, has a sleek fur coat and bright, curious eyes. The dog, with a friendly expression, has a fluffy coat and alert ears. Both animals gaze directly ahead, exuding a sense of calmness and companionship. The background is softly blurred, focusing attention on the cute pair. The warm lighting enhances the peaceful and inviting atmosphere." + }, + { + "prompt_en": "a dog on the left of a horse, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "dog", + "object_b": "horse", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a serene outdoor setting, a dog stands to the left of a horse from a front view perspective. The dog is a medium-sized canine with a shiny coat, looking charming as it stands attentively. It has a cheerful expression with its ears perked up, complementing the horse's majestic presence. The horse, a tall and handsome creature, has a sleek mane and stands proudly, gazing forward. The lighting is soft, highlighting the textures of both the animals' coats, and creating a harmonious and peaceful atmosphere. The camera captures this harmonious duo head-on, focusing on their serene expressions." 
+ }, + { + "prompt_en": "a horse on the right of a sheep, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "horse", + "object_b": "sheep", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In the scene, a majestic horse is standing on the right side of a sheep, both facing the camera. The horse, with its sleek coat and strong build, exudes grace and strength. It flicks its lush mane in the gentle breeze. The sheep, with its fluffy wool and gentle eyes, stands close beside the horse, adding contrast and charm to the scene. The two animals are positioned on a grassy field, with a soft, diffused light illuminating the setting, creating a serene and harmonious atmosphere. The camera captures this harmonious duo from a front view, highlighting the natural beauty of the animals and their surroundings." + }, + { + "prompt_en": "a sheep on the left of a cow, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "sheep", + "object_b": "cow", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a serene rural setting, a fluffy white sheep stands on the left side of a large, gentle-eyed cow. The scene is captured from a front view, with both animals calmly facing the camera. The woolly texture of the sheep contrasts with the cow's smooth, dark coat, creating a charming and picturesque image. The ground beneath them is covered in short green grass, and the light is soft, illuminating their fur and features beautifully. The atmosphere is peaceful and pastoral, evoking a sense of harmony and tranquility in this natural setting." + }, + { + "prompt_en": "a cow on the right of an elephant, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "cow", + "object_b": "elephant", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In this scene, a cow stands to the right of an elephant, both facing the camera from a front view. The cow, with its smooth coat and distinctive markings, looks curiously ahead, while the elephant’s textured skin and large, flapping ears draw attention. The background showcases a natural setting, with lush greenery surrounding the pair. The lighting is soft, highlighting the textures and features of both animals. The camera gently zooms in, focusing on the unique contrast between the two creatures." + }, + { + "prompt_en": "an elephant on the left of a bear, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "elephant", + "object_b": "bear", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In this scene, an elephant stands to the left of a bear, both facing the camera. The elephant, with its large ears and long trunk, is a majestic gray with textured skin and tusks peeking out. Its trunk is slightly curved, adding a sense of grace. Next to it, the bear, with a thick coat of dark brown fur, looks robust and powerful, its ears perked up and eyes curious. The ground beneath them is covered in dry grass, with a few scattered rocks and patches of earth, creating a natural setting for these magnificent creatures. The sunlight casts a warm glow, highlighting their distinct features." 
+ }, + { + "prompt_en": "a bear on the right of a zebra, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "bear", + "object_b": "zebra", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In a natural setting, a bear stands on the right beside a zebra, and both animals face the front, offering a full view of their features. The bear, with its thick fur and robust build, stands calmly, its eyes focused ahead. The zebra, with its striking black and white stripes, stands with an alert posture, its mane slightly ruffled by a gentle breeze. The scene is captured from a low angle, enhancing the majestic appearance of both animals against the backdrop of a clear blue sky. The atmosphere is serene, highlighting the harmony between these two diverse species in their shared environment." + }, + { + "prompt_en": "a zebra on the left of a giraffe, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "zebra", + "object_b": "giraffe", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "From a front view perspective, a zebra stands on the left side of a giraffe on a grassy plain. Both animals are stationary, showcasing their unique features. The zebra, with its striking black and white stripes, has its head turned slightly towards the camera, its ears perked up inquisitively. The giraffe towers over the zebra, its elegant neck stretched tall, with large, gentle eyes gazing straight ahead. The scene is bathed in the warm glow of the morning sun, casting soft shadows on the ground. Occasionally, a gentle breeze rustles through the grass, adding a sense of tranquility to the setting. The camera remains steady, capturing the serene beauty of the two creatures standing side by side." + }, + { + "prompt_en": "a giraffe on the right of a bird, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "giraffe", + "object_b": "bird", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In a serene setting, a graceful giraffe stands on the right side of a small bird. The giraffe is tall and elegant, its spotted neck arching gently upwards, while the bird perches calmly nearby. The background is a soft, natural landscape with a hint of earthy tones, highlighting the harmony of nature. The camera captures this scene from the front, providing a clear and charming view of both creatures." + }, + { + "prompt_en": "a bottle on the left of a wine glass, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "bottle", + "object_b": "wine glass", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a brightly lit scene, a wine bottle stands proudly to the left of a wine glass. The bottle is elegantly designed, with a deep green hue and a classy label, making it a focal point of sophistication. The wine glass is crystal clear, and its elegant shape reflects light beautifully. Both the bottle and glass are placed on a smooth, glossy surface that captures their reflections. The camera is positioned at a front view, creating a harmonious and balanced composition." 
+ }, + { + "prompt_en": "a wine glass on the right of a cup, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "wine glass", + "object_b": "cup", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In a softly lit setting, a wine glass stands elegantly on the right side of a simple, white cup. The camera captures a front view of both objects, with the wine glass tall and slender, and the cup short and with a gentle curve. The light creates subtle reflections on the glass, adding a touch of sophistication to the scene. The setup is minimalistic, with both items sitting on a smooth, reflective surface that mirrors their shapes. The atmosphere is calm and refined, emphasizing the contrast between the two vessels." + }, + { + "prompt_en": "a cup on the left of a fork, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "cup", + "object_b": "fork", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a well-lit kitchen setting, there's a white ceramic cup placed prominently on the left side of a shiny silver fork. The cup is simple yet elegant, with a smooth finish and a curved handle. The fork is lying horizontally beside the cup, its tines pointing toward the right. The camera maintains a steady front view, focusing on the two items against a neutral backdrop, creating a clean and minimalist composition. The lighting highlights the cup's surface and the fork's metallic gleam, adding a touch of sophistication to the scene." + }, + { + "prompt_en": "a fork on the right of a knife, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "fork", + "object_b": "knife", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "A sleek, metallic fork is placed to the right of a shiny knife, each standing vertically against a neutral background. The camera captures them from a front view, highlighting their polished surfaces and simple elegance. The soft lighting accentuates the reflective quality of the stainless steel, casting gentle shadows behind the utensils. The composition conveys a sense of balance and sophistication. The background remains minimalistic, drawing attention to the details of the fork and knife." + }, + { + "prompt_en": "a knife on the left of a spoon, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "knife", + "object_b": "spoon", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "A front view showcases a sleek knife positioned to the left of a polished spoon. Both utensils are placed on a smooth, white surface, reflecting soft ambient light. The knife, with its gleaming blade and simple handle, contrasts with the spoon's rounded, reflective surface. The camera captures these elements at eye level, emphasizing the clean and minimalist arrangement. The subtle shadows cast by the utensils add depth to the scene, highlighting their shape and form." 
+ }, + { + "prompt_en": "a spoon on the right of a bowl, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "spoon", + "object_b": "bowl", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In a brightly lit kitchen setting, there is a simple white ceramic bowl positioned in the center of the scene. To the right of the bowl, a shiny silver spoon is placed, resting delicately on the wooden surface of the table. The front view highlights the contrasting textures of the sleek spoon and the smooth bowl. The background is slightly blurred, emphasizing the bowl and spoon as the focus of the scene. The camera remains steady, capturing this minimalist arrangement with precision." + }, + { + "prompt_en": "a bowl on the left of a bottle, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "bowl", + "object_b": "bottle", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a well-lit room, a ceramic bowl and a glass bottle are arranged on a wooden table. The bowl, featuring a vibrant, hand-painted floral design, is positioned to the left of the bottle. The glass bottle, filled with a light amber liquid and topped with a cork, catches the ambient light, creating subtle reflections. The front-facing camera captures both the bowl and the bottle clearly, showcasing their textures and details against a softly blurred background for a serene and pleasant atmosphere." + }, + { + "prompt_en": "a potted plant on the left of a remote, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "potted plant", + "object_b": "remote", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "A small potted plant sits to the left of a television remote control on a wooden table. The plant features lush green leaves that cascade gently over the edges of a plain white planter, giving it a fresh and vibrant appearance. The remote control, sleek and black, lies flat next to the plant, with its buttons facing upwards. The camera captures this scene from a front view, providing a clear and balanced composition of the two objects against a softly blurred background. The overall atmosphere is calm and peaceful, highlighting a simple yet elegant arrangement." + }, + { + "prompt_en": "a remote on the right of a clock, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "remote", + "object_b": "clock", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "A simple front view captures a setting where a sleek digital clock sits prominently. To the right of the clock lies a modern remote control. The clock has a clear digital display showing the current time in bright numbers, while the remote features a series of buttons arranged in neat rows. The scene is softly lit, emphasizing the minimalistic design and clean lines of both objects. The remote is slightly angled, adding a sense of depth to the composition." 
+ }, + { + "prompt_en": "a clock on the left of a vase, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "clock", + "object_b": "vase", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a well-lit room, a decorative vase stands prominently on a surface, with a clock positioned to its left. The vase is elegantly designed with intricate patterns and vibrant colors. The clock features a classic round design with a clear white face and black numerals, framed by a sleek metallic rim. Both items are captured from a front-view perspective. The camera remains steady, providing a clear, focused view of the clock and vase, highlighting their contrasting yet complementary aesthetic qualities." + }, + { + "prompt_en": "a vase on the right of scissors, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "vase", + "object_b": "scissors", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In the foreground, a pair of scissors is positioned on the left side of the frame. The scissors are open, showcasing their sleek metallic blades. To the right of the scissors, a stylish vase stands elegantly. The vase is of medium height with a smooth, glossy surface and a subtle pattern around its midsection. It is empty, emphasizing its graceful form against a soft background. The lighting is gentle, casting faint shadows and highlighting the contrast between the metallic sheen of the scissors and the polished texture of the vase. The camera remains steady, capturing a clear frontal view of both objects." + }, + { + "prompt_en": "scissors on the left of a teddy bear, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "scissors", + "object_b": "teddy bear", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "From a front view, a teddy bear sits adorably in the center of the frame, with its soft, fluffy fur and endearing expression. To the left of the teddy bear, a pair of metal scissors lies on a flat surface. The scissors have shiny silver blades and black handles, contrasting with the plush texture of the bear. Soft, natural lighting highlights the scene, adding warmth and creating subtle shadows behind the objects." + }, + { + "prompt_en": "a teddy bear on the right of a potted plant, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "teddy bear", + "object_b": "potted plant", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In a brightly lit room with natural light streaming in, a fluffy teddy bear sits to the right of a vibrant potted plant. The bear, with its soft brown fur and adorable stitched smile, is positioned to face the camera directly. Its black button eyes gleam softly. Beside the teddy bear, the potted plant displays lush green leaves. The pot is simple and earthy, complementing the lively greenery. The scene’s atmosphere is calm and inviting, evoking a sense of warmth and coziness." 
+ }, + { + "prompt_en": "a frisbee on the left of a sports ball, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "frisbee", + "object_b": "sports ball", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In the scene, a bright red frisbee is positioned on the left side of a blue and white sports ball. Both are resting on a grassy field, with sunlight casting soft shadows around them. The camera captures a front view, focusing on the contrast between the smooth surface of the frisbee and the textured pattern of the sports ball. The lush green grass adds a vibrant and lively background to the composition." + }, + { + "prompt_en": "a sports ball on the right of a baseball bat, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "sports ball", + "object_b": "baseball bat", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In a well-lit room with a wooden floor, a sports ball rests to the right of a baseball bat, both positioned on the ground. The perspective is a direct front view, ensuring both objects are clearly visible. The baseball bat is positioned horizontally, with its handle slightly elevated and its polished wood surface reflecting the light. The sports ball is round, with visible seams that catch the light, providing a sharp contrast to the bat. The camera remains steady, capturing the two objects in their entirety against the simple, clean floor and a softly blurred background." + }, + { + "prompt_en": "a baseball bat on the left of a baseball glove, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "baseball bat", + "object_b": "baseball glove", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a well-lit scene, a baseball bat and a baseball glove are displayed side by side. The wooden baseball bat is sleek and polished, positioned on the left side. Its surface reflects the lighting, highlighting the bat's smooth texture. The dark brown leather baseball glove is carefully placed to the right of the bat, with its fingers slightly open, showcasing intricate stitching details. The background features a simple, neutral surface that emphasizes the bat and glove in their entirety. The camera remains stationary, providing a clear, front-facing view of these two iconic sports items." + }, + { + "prompt_en": "a baseball glove on the right of a tennis racket, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "baseball glove", + "object_b": "tennis racket", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "A baseball glove and a tennis racket are positioned in front of a neutral background, creating a balanced composition. The baseball glove is on the right, made from rich brown leather with intricate stitching and patterns on its surface. Next to it, on the left, lies a tennis racket with a sleek design, featuring a black handle and a netted face. Both items are arranged on a flat surface, showcased from a front-facing angle. The scene is evenly lit, highlighting the textures and details of the sports equipment." 
+ }, + { + "prompt_en": "a tennis racket on the left of a frisbee, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "tennis racket", + "object_b": "frisbee", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a front-view scene, a tennis racket is positioned on the left of a brightly colored frisbee. The tennis racket has a sleek design, with a black handle and a mesh string pattern, while the frisbee is vivid in appearance, featuring a striking neon green color. Both objects are placed against a clean, simple background, providing a clear focus on their distinct shapes and textures. The camera remains steady, capturing this dynamic composition with a sharp and vivid focus." + }, + { + "prompt_en": "a toilet on the left of a hair drier, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "toilet", + "object_b": "hair drier", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "The scene is a bathroom interior where a sleek, modern toilet is positioned on the left side. To the right of the toilet, mounted on the wall, is a compact hair dryer. The toilet features a smooth, white ceramic finish, while the hair dryer is silver with a nozzle pointing slightly downward. The setting is brightly lit, highlighting the clean and minimalist design. The camera maintains a stable front view, capturing both the toilet and hair dryer clearly." + }, + { + "prompt_en": "a hair drier on the right of a toothbrush, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "hair drier", + "object_b": "toothbrush", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In a brightly lit bathroom setting, a hair dryer is positioned on the right side of a standing toothbrush. The hair dryer has a sleek, modern design with a shiny black finish, and it rests slightly angled. The toothbrush, with a blue and white handle, stands upright in a small holder. Both items are set against a neutral background with a hint of shadow cast by the overhead lighting, creating a clean and orderly atmosphere. The camera captures this scene from a front view, allowing a clear perspective of both the hair dryer and the toothbrush." + }, + { + "prompt_en": "a toothbrush on the left of a sink, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "toothbrush", + "object_b": "sink", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "The scene features a close-up, front view of a bathroom sink, with a toothbrush placed to the left of it. The sink is shiny, with a smooth porcelain surface and a chrome faucet glistening under bright bathroom lighting. The toothbrush has a sleek handle with contrasting colors and soft bristles, angled slightly as it rests on the countertop. The camera remains steady, focusing on the toothbrush and sink, creating a clean and fresh atmosphere." 
+ }, + { + "prompt_en": "a sink on the right of a toilet, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "sink", + "object_b": "toilet", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "The scene shows a front view of a bathroom with a toilet on the left and a sink on the right, illuminated by soft, even lighting. The toilet has a classic design with a white seat and lid, while the sink is mounted on a simple vanity, complete with a sleek chrome faucet. The surface of the vanity is clean and uncluttered. A neutral-colored tile wall serves as the backdrop, creating a clean and modern atmosphere. The camera is positioned to capture both the toilet and sink in a balanced composition for a tidy and clear view." + }, + { + "prompt_en": "a chair on the left of a couch, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "chair", + "object_b": "couch", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a well-lit room with soft, diffused lighting, a cozy, stylish couch is positioned in the center. To its left, there is an elegant chair with a smooth, upholstered design. The chair features a sleek, modern look, complementing the couch's comfortable and inviting appearance. The camera is set at a frontal view, capturing both the chair and the couch in their entirety. This arrangement creates a balanced and harmonious atmosphere, emphasizing the stylish furniture setup." + }, + { + "prompt_en": "a couch on the right of a bed, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "couch", + "object_b": "bed", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "In the scene, a cozy and stylish living space is showcased. To the right side of the frame is a plush, modern couch upholstered in a light grey fabric. Its cushions are slightly indented, suggesting comfort and inviting relaxation. To the left, a neatly made bed with a soft white duvet and a few decorative pillows is visible. The scene is lit with soft, ambient light, creating a warm and inviting atmosphere. The overall setting exudes a sense of tranquility and comfort. The camera is positioned to capture the front view of this arrangement, highlighting the harmonious balance between the couch and the bed." + }, + { + "prompt_en": "a bed on the left of a tv, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "bed", + "object_b": "tv", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "The video captures a bedroom from the front view. On the left side, there is a neatly made bed with a plush, cream-colored comforter and an array of decorative pillows. The bed's headboard is simple yet elegant, complementing the room's cozy atmosphere. To the right of the bed, a modern flat-screen TV is mounted on the wall. The TV displays a still image of a serene ocean scene. The room is softly lit, creating a warm and inviting ambiance. The camera holds steady, showcasing the harmonious arrangement and detailing of the bedroom." 
+ }, + { + "prompt_en": "a tv on the right of a dining table, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "tv", + "object_b": "dining table", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "The scene features a dining table positioned in the center, with a sleek flat-screen TV set to the right. The table is elegantly set with a crisp, white tablecloth and has a simple flower arrangement. The TV, mounted on the wall, displays a vibrant and colorful image. The room is softly lit, creating an inviting and cozy atmosphere. A gentle camera pan from left to right focuses on both the dining setup and the TV, capturing the harmonious layout of the space." + }, + { + "prompt_en": "a dining table on the left of a chair, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "dining table", + "object_b": "chair", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "The scene features a dining table positioned to the left of a plush, cushioned chair, both viewed from the front. The table is elegantly set, with a neatly arranged tablecloth, polished silverware, and a small floral centerpiece adding charm to the setting. The lighting is warm and inviting, casting soft shadows on the table and chair. The cushioned chair has a smooth fabric with a subtle pattern, offering both comfort and style to the setting. The camera captures the scene with a gentle zoom, focusing on the harmonious arrangement." + }, + { + "prompt_en": "an airplane on the left of a train, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "airplane", + "object_b": "train", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In a dynamic scene from the front view, an airplane is positioned to the left of a train. The airplane is sleek and modern, with a metallic finish that gleams under bright daylight. Its nose is slightly tilted upward, ready for takeoff, with engines appearing powerful and ready. On the right, the train is sturdy and substantial, with a pronounced front and large headlights. The red metal of the train contrasts with the airplane's finish. Both the airplane and the train stretch into the distance, creating a feeling of movement and anticipation. The camera gradually pans closer, enhancing the sense of scale and momentum." + }, + { + "prompt_en": "a train on the right of a boat, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "train", + "object_b": "boat", + "relationship": "on the right of" + } + } + }, + "refined_prompt": "From a front-view perspective, a train and a boat are positioned side by side. The train, sleek and modern with a shiny exterior, is situated on the right. It has large windows and a bright headlight that illuminates the track ahead. The boat, on the left, is a small, classic wooden vessel with a polished finish, resting steadily on calm water. The scene is set against a backdrop of a clear sky, creating a harmonious and peaceful atmosphere. The camera remains steady, focusing on both the train and the boat as they align in perfect symmetry." 
+ }, + { + "prompt_en": "a boat on the left of an airplane, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "boat", + "object_b": "airplane", + "relationship": "on the left of" + } + } + }, + "refined_prompt": "In the scene, a small boat is visible to the left of a large airplane, both facing forward. The airplane dominates the background with its sleek, streamlined body and massive wings, while the boat appears smaller but equally striking, floating on calm water. The boat is characterized by its white hull and details on the sides, contrasting against the blue water below. The camera maintains a centered front view, capturing the juxtaposition between the two different vehicles. The lighting is soft, suggesting either early morning or late afternoon, adding a serene and balanced atmosphere to the composition." + }, + { + "prompt_en": "an oven on the top of a toaster, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "oven", + "object_b": "toaster", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "In a well-lit kitchen setting, an oven is stacked on top of a toaster, creating an unusual arrangement. From the front view, the stainless steel oven is prominently featured, highlighting its sleek design with a clear glass door. Below it, the toaster displays its two slots for toasting bread, with gleaming metallic finishes. The knobs and controls of both appliances are visible, adding to the scene's realistic detail. The scene is captured in a warm ambient lighting that casts gentle shadows, emphasizing the contours and textures of the appliances." + }, + { + "prompt_en": "an oven on the bottom of a toaster, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "oven", + "object_b": "toaster", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "In a well-lit kitchen setting, a compact oven is positioned below a toaster in a vertical stack. The front view shows the oven's sleek silver exterior, featuring a transparent glass door with a metallic handle. Above, the toaster has a polished stainless steel finish and two wide slots for toasting bread. The kitchen counter is visible around the appliances, providing a tidy and stylish atmosphere. The camera remains steady, showcasing the neat and efficient arrangement of the oven and toaster, highlighting both appliances' modern design." + }, + { + "prompt_en": "a toaster on the top of a microwave, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "toaster", + "object_b": "microwave", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "A silver toaster is neatly positioned on top of a sleek black microwave, both viewed from the front. The toaster is modern in design, with a shiny metal exterior and two wide slots visible from this angle. Its control knobs and levers are polished and add to its clean appearance. The microwave below features a glossy finish with a digital display and a clear door. The lighting in the scene is warm, highlighting the reflective surfaces and creating a cozy kitchen atmosphere. The camera remains steady to capture the neat arrangement of these appliances." 
+ }, + { + "prompt_en": "a toaster on the bottom of a microwave, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "toaster", + "object_b": "microwave", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "In a cozy kitchen setting, a toaster is positioned directly underneath a microwave, both viewed from the front. The toaster, with its sleek metallic design, features two slots on the top, hinting at its purpose. Above it, the microwave showcases a digital display and a row of buttons, adding a modern touch to the setting. The arrangement sits on a kitchen counter, with a tiled backsplash providing a subtle background. The lighting is warm, giving the appliances a welcoming and homey atmosphere. The camera is steady, capturing a clear and unobstructed view of both appliances." + }, + { + "prompt_en": "a microwave on the top of an oven, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "microwave", + "object_b": "oven", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "In a brightly lit kitchen, a sleek microwave is positioned on top of a stainless-steel oven, both shown from a front view. The microwave features a digital display and a row of buttons on the right side, its door designed with a glossy finish. Underneath, the oven has a large, clear glass door that reveals the interior, with a control panel above it. The light from the surrounding area casts a subtle reflection on both appliances, giving them a modern and clean appearance. The camera remains steady, capturing the elegance and functionality of the appliances in this contemporary kitchen setting." + }, + { + "prompt_en": "a microwave on the bottom of an oven, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "microwave", + "object_b": "oven", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "The video focuses on a microwave situated beneath an oven, capturing a front view of the appliance setup. The microwave features a sleek stainless-steel exterior, with a digital control panel on the right side displaying a bright LED time display and several buttons for various cooking settings. The oven above the microwave has a glossy finish with visible knobs and a wide glass door, showcasing its modern design. The camera slowly zooms in, accentuating the clean lines and contemporary style of both the microwave and oven. The lighting is bright and even, highlighting the appliances' polished surfaces." + }, + { + "prompt_en": "a banana on the top of an apple, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "banana", + "object_b": "apple", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "In a well-lit setting, a ripe, slightly curved banana is balanced on top of a bright red apple. The smooth, yellow peel of the banana contrasts with the glossy surface of the apple. The camera captures this front view, focusing on the arrangement, with a neutral background that highlights the vibrant colors of the fruit. The positioning of the banana and apple creates a harmonious balance, centering them perfectly within the frame. 
The lighting accentuates the textures and colors, adding a fresh and inviting atmosphere to the scene." + }, + { + "prompt_en": "a banana on the bottom of an apple, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "banana", + "object_b": "apple", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "In a bright, evenly lit setting, a ripe yellow banana is carefully balanced on the bottom of a shiny red apple. The scene is captured from the front view, highlighting the contrasting colors and textures of the fruits. The banana's smooth, curved shape gently rests on the apple's rounded surface, creating an interesting visual contrast between the two. The background is simple and unobtrusive, ensuring full attention on the playful arrangement of the banana and apple. The camera maintains a steady position to emphasize the unique composition." + }, + { + "prompt_en": "an apple on the top of a sandwich, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "apple", + "object_b": "sandwich", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "The video starts with a close-up front view of an apple placed on the top of a sandwich. The apple is bright red and glossy, adding a pop of color to the scene. The sandwich underneath consists of toasted bread with visible layers of lettuce, tomato, and deli meat peeking out. The camera slowly zooms in to highlight the details and textures of both the apple and the sandwich, before gently pulling back to capture the entire composition. The lighting is soft, emphasizing the fresh and appetizing appearance of the food." + }, + { + "prompt_en": "an apple on the bottom of a sandwich, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "apple", + "object_b": "sandwich", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "In a front view, a red, glossy apple is positioned at the bottom of a sandwich, providing a unique and quirky twist. The sandwich layers above the apple consist of leafy green lettuce, slices of ripe tomato, and a thick piece of deli meat, all held together by two slices of toasted bread. The textures are varied, with the smooth apple contrasting against the leafy and slightly textured sandwich toppings. The lighting highlights the vibrant colors and freshness of the ingredients, creating an appetizing and interesting composition." + }, + { + "prompt_en": "a sandwich on the top of an orange, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "sandwich", + "object_b": "orange", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "From a front view, a sandwich is artfully balanced on top of a vibrant orange. The sandwich, layered with fresh ingredients, features crisp lettuce, juicy tomato slices, and a hint of creamy dressing, all nestled between lightly toasted bread. The bright orange beneath adds a pop of color, its textured rind contrasting with the smooth edges of the sandwich. The scene is well-lit, highlighting the freshness and vividness of both the sandwich and the orange. The camera maintains a steady focus on this unique culinary arrangement." 
+ }, + { + "prompt_en": "a sandwich on the bottom of an orange, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "sandwich", + "object_b": "orange", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "A sandwich, composed of golden-brown toasted bread, lies horizontally on the bottom half of a halved orange. The vibrant orange flesh peeks out, contrasting with the sandwich's toasted surface. The camera captures this from the front, focusing closely on the textured bread and the vibrant color of the orange, with natural daylight illuminating the scene and highlighting the vivid juxtaposition of colors and textures." + }, + { + "prompt_en": "an orange on the top of a carrot, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "orange", + "object_b": "carrot", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "An orange is precariously balanced atop the narrow end of a carrot, presenting a visually intriguing front view. The carrot, with its rich, earthy orange hue, contrasts with the vibrant, slightly textured surface of the orange. The background is softly lit, creating a simplistic yet striking composition that draws attention to the peculiar arrangement of the two foods. The camera holds steady in a tight shot, emphasizing the balance and alignment of the orange and carrot against the plain backdrop." + }, + { + "prompt_en": "an orange on the bottom of a carrot, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "orange", + "object_b": "carrot", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "A vibrant orange is neatly balanced on the broader end of a carrot, forming an unusual stacked arrangement. From the front view, the orange sits securely on top of the carrot, which points upwards. The orange's textured peel contrasts with the smooth, tapered surface of the carrot. The scene is brightly lit, highlighting the natural colors and textures of both the orange and carrot. The white background emphasizes the playful and peculiar composition. The camera remains steady, capturing this intriguing visual combination." + }, + { + "prompt_en": "a carrot on the top of a hot dog, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "carrot", + "object_b": "hot dog", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "A hot dog sits in a neatly arranged bun, with a vibrant orange carrot placed prominently on top, where a sausage would typically be. The carrot's smooth surface contrasts with the bun's soft texture. The bun is lightly toasted, giving it a golden-brown hue. The camera captures a front view, focusing on the hot dog's slightly tilted arrangement. This angle emphasizes the unusual topping, creating an intriguing and appetizing visual. The background is softly blurred, drawing attention to the carrot-topped hot dog." 
+ }, + { + "prompt_en": "a carrot on the bottom of a hot dog, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "carrot", + "object_b": "hot dog", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "A vibrant orange carrot is positioned at the bottom of a hot dog bun, creating an unusual take on the classic food. From the front view, the carrot's tapered end is visible, nestled neatly within the bun's soft, golden-brown halves. The camera captures the scene in a well-lit setting, highlighting the carrot's vivid color and the texture of the bun. The simple yet intriguing combination is centered in the frame, drawing attention to this unique culinary presentation." + }, + { + "prompt_en": "a hot dog on the top of a pizza, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "hot dog", + "object_b": "pizza", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "A close-up, front view shot showcases a hot dog resting directly on top of a pizza. The hot dog is nestled on the pizza, emphasizing its plump and juicy appearance. The pizza displays a golden crust with melted cheese, pepperoni slices, and vibrant green peppers scattered throughout. The camera is positioned at eye level, focusing on the hot dog, while the tantalizing combination of toppings on the pizza is clearly visible, creating an appetizing and colorful scene. The warm lighting adds to the inviting and delicious atmosphere." + }, + { + "prompt_en": "a hot dog on the bottom of a pizza, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "hot dog", + "object_b": "pizza", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "A hot dog is placed at the bottom edge of a pizza, creating a visually striking combination of flavors. From the front view, the hot dog is nestled on the slightly raised, golden-brown crust. The pizza is generously topped with melted cheese and slices of pepperoni, glistening in the soft lighting. The hot dog's warm, rich texture contrasts with the pizza's savory ingredients, making it an intriguing culinary arrangement. The camera remains steady, capturing the entire ensemble as if presented on a wooden cutting board." + }, + { + "prompt_en": "a pizza on the top of a donut, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "pizza", + "object_b": "donut", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "From the front view, a large, delectable pizza is placed on top of a gigantic donut. The pizza is topped with a generous layer of melted cheese, pepperoni slices, and vibrant green bell peppers, adding a splash of color. The donut is frosted with a glossy, pink glaze, and colorful sprinkles are scattered over it, creating an enticing contrast to the savory pizza. The camera holds steady, capturing the unlikely combination of sweet and savory elements, providing a fascinating visual juxtaposition. 
" + }, + { + "prompt_en": "a pizza on the bottom of a donut, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "pizza", + "object_b": "donut", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "In a surreal and imaginative scene, from a front view, a savory pizza is artistically balanced on the bottom of a large glazed donut. The pizza, with rich tomato sauce and melted cheese, features colorful toppings such as pepperoni slices and green bell peppers. The donut, with its glossy surface and light golden hue, has a smooth and shiny glaze that catches the light. The scene is well-lit, emphasizing the contrasting textures and colors of the pizza and donut, creating a playful and whimsical atmosphere. The camera holds a steady focus, capturing the unique combination of these two foods." + }, + { + "prompt_en": "a donut on the top of broccoli, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "donut", + "object_b": "broccoli", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "In a well-lit kitchen setting, a fresh, vibrant green broccoli floret is positioned at the center of a clean, white plate. Atop the broccoli sits a perfectly round donut with a glossy pink glaze and colorful sprinkles. The donut is balanced delicately on the broccoli, creating a whimsical, unexpected visual contrast between the dessert and vegetable. The camera captures this unique pairing from a front view, focusing sharply on both the donut and the broccoli, ensuring their textures and colors are clearly defined against the simple background." + }, + { + "prompt_en": "a donut on the bottom of broccoli, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "donut", + "object_b": "broccoli", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "A colorful donut sits atop a bed of vibrant green broccoli florets, offering a unique combination of colors and textures. The donut has a glossy frosting with rainbow sprinkles, adding a playful touch to the scene. The broccoli, with its rich color and dense texture, forms a fresh and healthy base. The background features a softly blurred setting to keep the focus on the donut and broccoli. The camera holds a steady front-facing view, capturing the contrasting elements in vivid detail." + }, + { + "prompt_en": "broccoli on the top of a banana, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "broccoli", + "object_b": "banana", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "A front view shows a piece of fresh broccoli balancing on top of a ripe yellow banana. The vibrant green of the broccoli contrasts with the banana's smooth, slightly curved peel. The banana is positioned upright, adding an element of tension to the composition. Soft lighting from the front highlights the textures, with the florets of the broccoli and the banana's surface in clear view. The background is minimalistic, emphasizing the unique pairing." 
+ }, + { + "prompt_en": "broccoli on the bottom of a banana, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "broccoli", + "object_b": "banana", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "In a brightly lit setting, a front view reveals a whimsical combination of a broccoli crown positioned on the bottom of a banana. The vibrant green florets contrast sharply with the smooth, yellow surface of the banana peel. The banana is upright, with its curved shape adding an interesting dynamic to the unusual pairing. The background is a simple, neutral color to emphasize the unique fusion of the two foods. The camera remains stationary to allow a clear view of this imaginative creation." + }, + { + "prompt_en": "skis on the top of a snowboard, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "skis", + "object_b": "snowboard", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "In a close-up front view, a pair of skis is neatly positioned on top of a snowboard. The skis are aligned parallel to each other, with their sharp tips pointing forward and slightly upward. The snowboard, wider and with vibrant graphic designs, provides a colorful backdrop to the sleek, narrow skis. The lighting creates a reflective sheen on the surfaces, highlighting the textures of the metal edges and the smooth finish of both the skis and the snowboard. The camera stays focused to emphasize the contrast between the skis and the snowboard." + }, + { + "prompt_en": "skis on the bottom of a snowboard, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "skis", + "object_b": "snowboard", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "The scene features the front view of skis attached to the bottom of a snowboard, positioned on a snowy slope. The skis, sleek and glossy, are aligned parallel to each other, with the tips curving slightly upwards. Snowflakes cling to the surface, glistening in the bright daylight. The white snow provides a perfect clean backdrop, enhancing the contrast and details of the skis and snowboard." + }, + { + "prompt_en": "a snowboard on the top of a kite, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "snowboard", + "object_b": "kite", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "A snowboard is perched upside down on the top edge of a large, vibrant kite. The kite is brightly colored with shades of blue and yellow, creating a striking contrast against the wintry backdrop. In the front view, the snowboard is prominently displayed, its sleek, glossy surface catching the light. The string and structure of the kite are visible, showcasing intricate design details. The snow-covered landscape in the background adds a crisp, cold atmosphere, with the wind gently rustling the kite’s fabric. The camera slowly zooms in to capture the texture and details of both the snowboard and kite." 
+ }, + { + "prompt_en": "a snowboard on the bottom of a kite, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "snowboard", + "object_b": "kite", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "A close-up, front view reveals a snowboard attached to the bottom of a colorful, vibrant kite. The kite's fabric displays a striking array of colors, with a captivating pattern that catches the eye. The snowboard, a sleek and aerodynamic design, has a glossy finish, reflecting the sunlight. The snowy landscape in the background provides vivid contrast to the bright colors of the kite, enhancing the overall dynamic and energetic feel of the scene. The camera remains steady to capture every intricate detail of the snowboard and kite as they sway slightly in the wind." + }, + { + "prompt_en": "a kite on the top of a skateboard, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "kite", + "object_b": "skateboard", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "A brightly colored kite with a vibrant red, yellow, and blue diamond-shaped pattern rests on top of a skateboard, viewed from the front. The skateboard's deck features a dark, textured wood finish, while its wheels are a vivid shade of orange. The kite's tail, made of alternating colored ribbons, drapes over the sides of the skateboard, creating a playful and eye-catching contrast. The scene captures the objects in clear daylight, accentuating their colors and patterns against a neutral backdrop. The camera gently zooms in to focus more closely on the details of the kite and skateboard." + }, + { + "prompt_en": "a kite on the bottom of a skateboard, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "kite", + "object_b": "skateboard", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "A skateboard is positioned in a way that its bottom is clearly visible from the front view, revealing a colorful kite design. The kite features vibrant patterns in shades of blue, red, and yellow, with sweeping shapes that convey a sense of motion. The skateboard's wheels are also visible and have a clean, polished look. The surface of the skateboard sits against a simple, neutral background that highlights the vivid colors of the kite design. The camera angle is perfectly centered, providing a symmetrical and balanced view of the skateboard's underside." + }, + { + "prompt_en": "a skateboard on the top of a surfboard, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "skateboard", + "object_b": "surfboard", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "The video captures a front view of a skateboard resting perfectly balanced on the top of a surfboard. The surfboard has a sleek, glossy finish with a vibrant blue stripe running down its center, while the skateboard features a classic wooden deck with black grip tape. Both are set against a clear background, providing a sharp contrast that highlights their distinct shapes and textures. The lighting accentuates the skateboard's metallic trucks and wheels, casting subtle reflections on the surfboard's glossy surface. 
The camera remains steady, focusing closely on this unusual yet harmonious pairing." + }, + { + "prompt_en": "a skateboard on the bottom of a surfboard, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "skateboard", + "object_b": "surfboard", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "The video shows a close-up, front view of a skateboard affixed to the bottom of a surfboard. The contrasting textures are striking, with the smooth, glossy surface of the surfboard sharply juxtaposed against the rougher surface of the skateboard deck. The skateboard's wheels and trucks are clearly visible, neatly attached to the underside of the surfboard. In the background, there's a soft oceanic ambiance, with gentle waves lapping at the shore under a subtly clouded sky." + }, + { + "prompt_en": "a surfboard on the top of skis, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "surfboard", + "object_b": "skis", + "relationship": "on the top of" + } + } + }, + "refined_prompt": "The scene captures a surfboard precariously balanced atop a pair of skis from a front view. The surfboard, with a bright and colorful design of blue and yellow, rests horizontally across the skis. The skis, which are red with black bindings, are positioned vertically with their tips slightly pointed outward. The entire setup is positioned against a clear sky, giving the impression of a creative and unusual combination of sports equipment. The camera remains stationary, emphasizing the unique juxtaposition of the surfboard and skis." + }, + { + "prompt_en": "a surfboard on the bottom of skis, front view", + "dimension": [ + "spatial_relationship" + ], + "auxiliary_info": { + "spatial_relationship": { + "spatial_relationship": { + "object_a": "surfboard", + "object_b": "skis", + "relationship": "on the bottom of" + } + } + }, + "refined_prompt": "From a front view, a surfboard is ingeniously attached to the bottom of a pair of skis, creating a unique and unconventional design. The curved front tip of the surfboard is visible, combining the sleek surface of the board with the parallel lines of the skis. The surface of the skis has a glossy finish that reflects light, while the surfboard's texture adds an interesting contrast. The overall composition is centered, showcasing the innovative combination of the two sporting equipment pieces. The lighting is bright and clear, emphasizing the details of the design." 
+ } +] \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/__init__.py b/Meissonic/InfinityStar/infinity/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce68af73834914377d02a72e2a8b5c04781718ac --- /dev/null +++ b/Meissonic/InfinityStar/infinity/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/infinity/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ccaac29a0b4eb8df0cfa5fa901177a47eebc486 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/dataset/__pycache__/build.cpython-310.pyc b/Meissonic/InfinityStar/infinity/dataset/__pycache__/build.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..341b0ef3056f7c1458ecf6b68659b99da7477a71 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/dataset/__pycache__/build.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/dataset/__pycache__/dataset_joint_vi.cpython-310.pyc b/Meissonic/InfinityStar/infinity/dataset/__pycache__/dataset_joint_vi.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fabd4ff5a1a89cddee12ad2f68fa06a818a9be21 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/dataset/__pycache__/dataset_joint_vi.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/dataset/build.py b/Meissonic/InfinityStar/infinity/dataset/build.py new file mode 100644 index 0000000000000000000000000000000000000000..de77521f4ddaea5049721029443e4e01d08d297b --- /dev/null +++ b/Meissonic/InfinityStar/infinity/dataset/build.py @@ -0,0 +1,218 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import datetime +import os +import os.path as osp +import random +import subprocess +from functools import partial +from typing import Optional +import time + +import pytz + +from infinity.dataset.dataset_joint_vi import JointViIterableDataset +from infinity.utils.sequence_parallel import SequenceParallelManager as sp_manager + +try: + from grp import getgrgid + from pwd import getpwuid +except: + pass +import PIL.Image as PImage +from PIL import ImageFile +import numpy as np +from torchvision.transforms import transforms +from torchvision.transforms.functional import resize, to_tensor +import torch.distributed as tdist + +from torchvision.transforms import InterpolationMode +bicubic = InterpolationMode.BICUBIC +lanczos = InterpolationMode.LANCZOS +PImage.MAX_IMAGE_PIXELS = (1024 * 1024 * 1024 // 4 // 3) * 5 +ImageFile.LOAD_TRUNCATED_IMAGES = False + + +def time_str(fmt='[%m-%d %H:%M:%S]'): + return datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime(fmt) + + +def normalize_01_into_pm1(x): # normalize x from [0, 1] to [-1, 1] by (x*2) - 1 + return x.add(x).add_(-1) + + +def denormalize_pm1_into_01(x): # denormalize x from [-1, 1] to [0, 1] + return x.add(1).mul_(0.5) + + +def center_crop_arr(pil_image, image_size): + """ + Center cropping implementation from ADM. 
+ https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126 + """ + while min(*pil_image.size) >= 2 * image_size: + pil_image = pil_image.resize( + tuple(x // 2 for x in pil_image.size), resample=PImage.BOX + ) + + scale = image_size / min(*pil_image.size) + pil_image = pil_image.resize( + tuple(round(x * scale) for x in pil_image.size), resample=PImage.LANCZOS + ) + + arr = np.array(pil_image) + crop_y = (arr.shape[0] - image_size) // 2 + crop_x = (arr.shape[1] - image_size) // 2 + return PImage.fromarray(arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]) + + +class RandomResize: + def __init__(self, mid_reso, final_reso, interpolation): + ub = max(round((mid_reso + (mid_reso-final_reso) / 8) / 4) * 4, mid_reso) + self.reso_lb, self.reso_ub = final_reso, ub + self.interpolation = interpolation + + def __call__(self, img): + return resize(img, size=random.randint(self.reso_lb, self.reso_ub), interpolation=self.interpolation) + + def __repr__(self): + return f'RandomResize(reso=({self.reso_lb}, {self.reso_ub}), interpolation={self.interpolation})' + + +def load_save(reso=512): + import os + from PIL import Image as PImage + from torchvision.transforms import transforms, InterpolationMode + aug = transforms.Compose([ + transforms.Resize(512, interpolation=InterpolationMode.LANCZOS), + transforms.CenterCrop((512, 512)) + ]) + src_folder = r'C:\Users\16333\Pictures\imgs_to_visual_v2' + ls = [os.path.join(src_folder, x) for x in ('1.jpg', '2.jpg', '3.png', '4.png', '5.png')] + print(ls) + imgs = [] + for i, fname in enumerate(ls): + assert os.path.exists(fname) + with PImage.open(fname) as img: + img = img.convert('RGB') + img = aug(img) + imgs.append(img) + dst_d, dst_f = os.path.split(fname) + dst = os.path.join(dst_d, f'crop{dst_f.replace(".jpg", ".png")}') + img.save(dst) + + W, H = imgs[0].size + WW = W * len(imgs) + new_im = PImage.new('RGB', (WW, H)) + x_offset = 0 + for img in imgs: + new_im.paste(img, (x_offset, 0)) + x_offset += W + dst = os.path.join(src_folder, f'junfeng.png') + new_im.save(dst) + + +def print_aug(transform, label): + print(f'Transform {label} = ') + if hasattr(transform, 'transforms'): + for t in transform.transforms: + print(t) + else: + print(transform) + print('---------------------------\n') + + +def build_t2i_dataset( + args, + data_path: str, + max_caption_len: int, + short_prob=0.2, + load_vae_instead_of_image=False +): + if args.use_streaming_dataset: + return T2IIterableDataset( + data_path, + max_caption_len=max_caption_len, + short_prob=short_prob, + load_vae_instead_of_image=load_vae_instead_of_image, + buffersize=args.iterable_data_buffersize, + pn=args.pn, + online_t5=args.online_t5, + batch_size=args.batch_size, + num_replicas=sp_manager.get_sp_group_nums() if sp_manager.sp_on() else tdist.get_world_size(), # 1, + rank = sp_manager.get_sp_group_rank() if sp_manager.sp_on() else tdist.get_rank(), + dataloader_workers=args.workers, + dynamic_resolution_across_gpus=args.dynamic_resolution_across_gpus, + enable_dynamic_length_prompt=args.enable_dynamic_length_prompt, + seed=args.seed, + dynamic_scale_schedule=args.dynamic_scale_schedule, + ) + else: + raise ValueError(f'args.use_streaming_dataset={args.use_streaming_dataset} unsupported') + + +def build_joint_dataset( + args, + image_data_path: str, + video_data_path: str, + max_caption_len: int, + short_prob=0.2, + load_vae_instead_of_image=False +): + if args.use_streaming_dataset: + return JointViIterableDataset( + 
image_meta_folder=image_data_path, + video_meta_folder=video_data_path, + max_caption_len=max_caption_len, + short_prob=short_prob, + load_vae_instead_of_image=load_vae_instead_of_image, + buffersize=args.iterable_data_buffersize, + pn=args.pn, + video_fps=args.video_fps, + num_frames=args.video_frames, + online_t5=args.online_t5, + num_replicas=sp_manager.get_sp_group_nums() if sp_manager.sp_on() else tdist.get_world_size(), # 1, + rank = sp_manager.get_sp_group_rank() if sp_manager.sp_on() else tdist.get_rank(), + dataloader_workers=args.workers, + dynamic_resolution_across_gpus=args.dynamic_resolution_across_gpus, + enable_dynamic_length_prompt=args.enable_dynamic_length_prompt, + dynamic_scale_schedule=args.dynamic_scale_schedule, + add_motion_score2caption=args.add_motion_score2caption, + seed=args.seed, + other_args=args, + ) + else: + raise ValueError(f'args.use_streaming_dataset={args.use_streaming_dataset} unsupported') + +def pil_load(path: str, proposal_size): + with open(path, 'rb') as f: + img: PImage.Image = PImage.open(f) + w: int = img.width + h: int = img.height + sh: int = min(h, w) + if sh > proposal_size: + ratio: float = proposal_size / sh + w = round(ratio * w) + h = round(ratio * h) + img.draft('RGB', (w, h)) + img = img.convert('RGB') + return img + + +def rewrite(im: PImage, file: str, info: str): + kw = dict(quality=100) + if file.lower().endswith('.tif') or file.lower().endswith('.tiff'): + kw['compression'] = 'none' + elif file.lower().endswith('.webp'): + kw['lossless'] = True + + st = os.stat(file) + uname = getpwuid(st.st_uid).pw_name + gname = getgrgid(st.st_gid).gr_name + mode = oct(st.st_mode)[-3:] + + local_file = osp.basename(file) + im.save(local_file, **kw) + print(f'************* ************* @ {file}') + subprocess.call(f'sudo mv {local_file} {file}; sudo chown {uname}:{gname} {file}; sudo chmod {mode} {file}', shell=True) diff --git a/Meissonic/InfinityStar/infinity/dataset/dataset_joint_vi.py b/Meissonic/InfinityStar/infinity/dataset/dataset_joint_vi.py new file mode 100644 index 0000000000000000000000000000000000000000..730a90ece78eb2e972c72aafa9ff08b8a7dd7387 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/dataset/dataset_joint_vi.py @@ -0,0 +1,689 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import glob +import os +import time +from os import path as osp +from typing import List, Tuple +import json +import hashlib +import copy +import collections + +import tqdm +import numpy as np +import torch +import pandas as pd +from decord import VideoReader +from PIL import Image as PImage +from torchvision.transforms.functional import to_tensor +from torch.utils.data import IterableDataset, DataLoader +import torch.distributed as tdist +from PIL import Image +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +from infinity.schedules.dynamic_resolution import get_dynamic_resolution_meta +from infinity.utils.video_decoder import EncodedVideoDecord, EncodedVideoOpencv +from transformers import AutoTokenizer + +def transform(pil_img, tgt_h, tgt_w): + width, height = pil_img.size + if width / height <= tgt_w / tgt_h: + resized_width = tgt_w + resized_height = int(tgt_w / (width / height)) + else: + resized_height = tgt_h + resized_width = int((width / height) * tgt_h) + pil_img = pil_img.resize((resized_width, resized_height), resample=PImage.LANCZOS) + # crop the center out + arr = np.array(pil_img) + crop_y = (arr.shape[0] - tgt_h) // 2 + crop_x = (arr.shape[1] - tgt_w) // 2 + im = to_tensor(arr[crop_y: crop_y + tgt_h, 
crop_x: crop_x + tgt_w]) + # print(f'im size {im.shape}') + return im.add(im).add_(-1) + +def get_prompt_id(prompt): + md5 = hashlib.md5() + md5.update(prompt.encode('utf-8')) + prompt_id = md5.hexdigest() + return prompt_id + +def prepend_motion_score(prompt, motion_score): + return f'<<>> {prompt}' + +class VideoReaderWrapper(VideoReader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.seek(0) + def __getitem__(self, key): + frames = super().__getitem__(key) + self.seek(0) + return frames + + +class JointViIterableDataset(IterableDataset): + def __init__( + self, + video_meta_folder: str = '', + buffersize: int = 1000000 * 300, + seed: int = 0, + pn: str = '', + video_fps: int = 1, + num_replicas: int = 1, + rank: int = 0, + dataloader_workers: int = 2, + dynamic_resolution_across_gpus: bool = True, + enable_dynamic_length_prompt: bool = True, + shuffle: bool = True, + short_prob: float = 0.2, + verbose=False, + temp_dir= "/dev/shm", + add_motion_score2caption=False, + other_args=None, + **kwargs, + ): + self.video_meta_folder = video_meta_folder + self.pn = pn + self.verbose = verbose + self.buffer_size = buffersize + self.num_replicas = num_replicas + self.rank = rank + self.worker_id = 0 + self.global_worker_id = 0 + self.short_prob = short_prob + self.dataloader_workers = max(1, dataloader_workers) + self.shuffle = shuffle + self.global_workers = self.num_replicas * self.dataloader_workers + self.add_motion_score2caption = add_motion_score2caption + self.seed = seed + self.text_tokenizer = other_args.text_tokenizer + self.feature_extraction = other_args.cache_check_mode < 0 # no sequence packing, for feature extraction + self.epoch_generator = None + self.epoch_worker_generator = None + self.epoch_global_worker_generator = None + self.epoch_rank_generator = None + self.other_args = other_args + self.drop_long_video = other_args.drop_long_video + self.dynamic_resolution_across_gpus = dynamic_resolution_across_gpus + self.enable_dynamic_length_prompt = enable_dynamic_length_prompt + self.set_epoch(other_args.epoch) + self.temporal_compress_rate = other_args.temporal_compress_rate + self.dynamic_resolution_h_w, self.h_div_w_templates = get_dynamic_resolution_meta(other_args.dynamic_scale_schedule, other_args.video_frames) # here video_frames is the max video frames + self.train_h_div_w_list = self.h_div_w_templates + self.video_fps = video_fps + self.min_training_duration = (other_args.min_video_frames-1) // self.video_fps + self.max_training_duration = (other_args.video_frames-1) // self.video_fps + self.append_duration2caption = other_args.append_duration2caption + print(f"{self.rank=} dataset {self.seed=}, {self.append_duration2caption=} add_motion_score2caption={add_motion_score2caption}, {self.min_training_duration=} {self.max_training_duration=}, cache_check_mode={self.other_args.cache_check_mode}") + self.token_cache_dir = other_args.token_cache_dir + self.use_vae_token_cache = other_args.use_vae_token_cache + self.allow_online_vae_feature_extraction = other_args.allow_online_vae_feature_extraction + self.use_text_token_cache = other_args.use_text_token_cache + self.max_video_frames = other_args.video_frames + self.cached_video_frames = other_args.cached_video_frames # cached max video frames + self.image_batches_multiply = other_args.image_batches_multiply + self.down_size_limit = other_args.down_size_limit + self.addition_pn_list = json.loads(other_args.addition_pn_list) + self.video_caption_type = other_args.video_caption_type + 
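# Sequence-packing budget: train_max_token_len below caps the total text and visual tokens that form_batches packs into one training sequence. +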
self.train_max_token_len = other_args.train_max_token_len + self.duration_resolution = other_args.duration_resolution + self.append_duration2caption = other_args.append_duration2caption + self.device = other_args.device + print(f'self.down_size_limit: {self.down_size_limit}') + self.max_text_len = other_args.tlen + self.temp_dir = temp_dir.rstrip("/") + self.metas = self.get_meta() + self.batches, self.batch_nums = self.form_batches(self.metas) + print(f'{num_replicas=}, {rank=}, {dataloader_workers=}, {self.batch_nums=}, {self.drop_long_video=} {self.max_text_len=}') + + def append_duration_info(self, meta, mapped_duration): + meta['caption'] = f'<<>>' + meta['caption'] + return meta + + def get_captions_lens(self, captions): + if self.other_args.text_tokenizer_type == 'flan_t5': + tokens = self.other_args.text_tokenizer(text=captions, max_length=self.other_args.text_tokenizer.model_max_length, padding='max_length', truncation=True, return_tensors='pt') + mask = tokens.attention_mask.cuda(non_blocking=True) + lens: List[int] = mask.sum(dim=-1).tolist() + else: # umt5-xxl + ids, mask = self.other_args.text_tokenizer( captions, return_mask=True, add_special_tokens=True) + lens = mask.gt(0).sum(dim=1).tolist() + return lens + + def get_meta(self): + part_filepaths = sorted(glob.glob(osp.join(self.video_meta_folder, '*/*.jsonl'))) + self.epoch_generator.shuffle(part_filepaths) + print(f'jsonls sample: {part_filepaths[:4]}') + if self.num_replicas > 1: + part_filepaths = part_filepaths[self.rank::self.num_replicas] + + metas = [] + pbar = tqdm.tqdm(total=len(part_filepaths)) + mapped_duration2freqs = collections.defaultdict(int) + total, corrupt = 0, 0 + stop_read = False + rough_h_div_w = self.h_div_w_templates[np.argmin(np.abs((9/16-self.h_div_w_templates)))] + for part_filepath in part_filepaths: + if stop_read: + break + pbar.update(1) + with open(part_filepath, 'r', encoding='utf-8') as f: + lines = f.readlines() + for line in lines: + total += 1 + try: + meta = json.loads(line) + except Exception as e: + print(e) + corrupt += 1 + print(e, corrupt, total, corrupt/total) + continue + if 'h_div_w' in meta: + del meta['h_div_w'] + if 'video_path' in meta: + begin_frame_id, end_frame_id, fps = meta['begin_frame_id'], meta['end_frame_id'], meta['fps'] + real_duration = (end_frame_id - begin_frame_id) / fps + mapped_duration = int(real_duration / self.duration_resolution) * self.duration_resolution + if mapped_duration < self.min_training_duration: + continue + if mapped_duration > self.max_training_duration: + if self.drop_long_video: + continue + else: + mapped_duration = self.max_training_duration + caption_type = 'tarsier2_caption' + if ('MiniCPM_V_2_6_caption' in meta) and meta['MiniCPM_V_2_6_caption']: + caption_type = self.epoch_rank_generator.choice(['tarsier2_caption', 'MiniCPM_V_2_6_caption']) + meta['caption'] = meta[caption_type] + if self.enable_dynamic_length_prompt and (self.epoch_rank_generator.random() < self.short_prob): + meta['caption'] = self.random_drop_sentences(meta['caption']) + if 'quality_prompt' in meta: + meta['caption'] = meta['caption'] + ' ' + meta['quality_prompt'] + if self.append_duration2caption: + meta = self.append_duration_info(meta, mapped_duration) + assert meta['caption'] + sample_frames = int(mapped_duration * self.video_fps + 1) + pt = (sample_frames-1) // self.temporal_compress_rate + 1 + scale_schedule = self.dynamic_resolution_h_w[rough_h_div_w][self.pn]['pt2scale_schedule'][pt] + meta['sample_frames'] = sample_frames + elif 'image_path' in meta: + 
mapped_duration = -1 + scale_schedule = self.dynamic_resolution_h_w[rough_h_div_w][self.pn]['pt2scale_schedule'][1] + if not meta['text']: + meta['caption'] = meta['long_caption'] + elif not meta['long_caption']: + meta['caption'] = meta['text'] + else: + if self.epoch_rank_generator.random() < self.other_args.short_cap_prob: + meta['caption'] = meta['text'] + else: + meta['caption'] = meta['long_caption'] + if self.enable_dynamic_length_prompt and (self.epoch_rank_generator.random() < self.short_prob): + meta['caption'] = self.random_drop_sentences(meta['caption']) + else: + raise ValueError(f'video_path or image_path not exist in meta: {meta}') + + cum_visual_tokens = np.array(scale_schedule).prod(-1).cumsum() + meta['cum_text_visual_tokens'] = cum_visual_tokens + if self.other_args.cache_check_mode == 1: # check at the begining + if self.exists_cache_file(meta): + metas.append(meta) + elif self.other_args.cache_check_mode == -1: # select unexist, used for token cache + if not self.exists_cache_file(meta): + metas.append(meta) + else: + metas.append(meta) + mapped_duration2freqs[mapped_duration] += 1 + if (self.other_args.restrict_data_size > 0) and (len(metas) > self.other_args.restrict_data_size / self.num_replicas): + stop_read = True + break + + # metas = sorted(metas, key=lambda x: -x['text_visual_tokens']) + + # append text tokens + metas = self.append_text_tokens(metas) + + self.epoch_rank_generator.shuffle(metas) + for mapped_duration in sorted(mapped_duration2freqs.keys()): + freq = mapped_duration2freqs[mapped_duration] + proportion = freq / len(metas) * 100 + print(f'{mapped_duration=}, {freq=}, {proportion=:.1f}%') + return metas + + def append_text_tokens(self, metas, bucket_size=100): + t1 = time.time() + max_text_visual_tokens = -1 + pbar = tqdm.tqdm(total=len(metas) // bucket_size + 1, desc='append text tokens') + for bucket_id in range(len(metas) // bucket_size + 1): + pbar.update(1) + start = bucket_id * bucket_size + end = min(start + bucket_size, len(metas)) + if start >= end: + break + if self.feature_extraction: + lens = [0 for i in range(start, end)] + else: + captions = [metas[i]['caption'] for i in range(start, end)] + assert len(captions), f'{len(captions)=}' + lens = self.get_captions_lens(captions) + for i in range(start, end): + metas[i]['text_tokens'] = min(self.max_text_len, lens[i-start]) + metas[i]['cum_text_visual_tokens'] = metas[i]['cum_text_visual_tokens'] + metas[i]['text_tokens'] + metas[i]['text_visual_tokens'] = metas[i]['cum_text_visual_tokens'][-1] + max_text_visual_tokens = max(max_text_visual_tokens, metas[i]['text_visual_tokens']) + if not self.other_args.allow_less_one_elem_in_seq: + assert max_text_visual_tokens <= self.train_max_token_len, f'{self.train_max_token_len=} should > {max_text_visual_tokens=}' + t2 = time.time() + print(f'append text tokens: {t2-t1:.1f}s') + return metas + + def exists_cache_file(self, meta): + if 'image_path' in meta: + return osp.exists(self.get_image_cache_file(meta['image_path'])) + else: + if '/vdataset/clip' in meta['video_path']: # clip + cache_file = self.get_video_cache_file(meta['video_path'], 0, meta['end_frame_id']-meta['begin_frame_id'], self.video_fps) + else: + cache_file = self.get_video_cache_file(meta['video_path'], meta['begin_frame_id'], meta['end_frame_id'], self.video_fps) + return osp.exists(cache_file) + + def form_batches(self, metas): + st = time.time() + if self.feature_extraction: # no sequence packing, for feature extraction + batches = [[item] for item in range(len(metas))] + else: 
+ batches = [] + has_been_used = [False for _ in range(len(metas))] + bucket_size = min(len(metas), self.other_args.seq_pack_bucket) + print(f'[data preprocess] form_batches form {len(metas)} metas, bucket_size={bucket_size}...') + step = len(metas) // bucket_size + 1 + for bucket_id in range(step): + left_ptr = bucket_id + while left_ptr < len(metas): + tmp_batch = [left_ptr] + tokens_remain = self.train_max_token_len - metas[left_ptr]['text_visual_tokens'] + left_ptr += step + while (left_ptr < len(metas)) and (metas[left_ptr]['text_visual_tokens'] <= tokens_remain): + if not has_been_used[left_ptr]: + has_been_used[left_ptr] = True + tokens_remain -= metas[left_ptr]['text_visual_tokens'] + tmp_batch.append(left_ptr) + left_ptr += step + tmp_ptr = left_ptr + step + while tmp_ptr < len(metas) and tokens_remain > 0: + if (not has_been_used[tmp_ptr]) and (metas[tmp_ptr]['text_visual_tokens'] <= tokens_remain): + has_been_used[tmp_ptr] = True + tokens_remain -= metas[tmp_ptr]['text_visual_tokens'] + tmp_batch.append(tmp_ptr) + tmp_ptr += step + + # Fill the remaining budget with sequences from samples whose text_tokens fit within tokens_remain (truncated via cum_text_visual_tokens), to improve utilization + if tokens_remain > 0: + increase_seq_usage_times = 0 + while increase_seq_usage_times == 0 or (tokens_remain > self.max_text_len): + increase_seq_usage_times += 1 + if increase_seq_usage_times >= 3: break + select_map = {} + for ind in tmp_batch: + select_map[ind] = True + candidates = [] + min_val = 99999999 + for tmp_ind in range(bucket_id, len(metas), step): + if (metas[tmp_ind]['cum_text_visual_tokens'][0] <= tokens_remain) and (tmp_ind not in select_map): + import bisect + idx = bisect.bisect_right(metas[tmp_ind]['cum_text_visual_tokens'], tokens_remain) + if tokens_remain - metas[tmp_ind]['cum_text_visual_tokens'][idx-1] < min_val: + min_val = tokens_remain - metas[tmp_ind]['cum_text_visual_tokens'][idx-1] + candidates = [tmp_ind] + elif tokens_remain - metas[tmp_ind]['cum_text_visual_tokens'][idx-1] == min_val: + candidates.append(tmp_ind) + if len(candidates): + tmp_batch.append(self.epoch_rank_generator.choice(candidates)) + tokens_remain = min_val + else: + break + batches.append(tmp_batch) + if len(batches) % 1000 == 0: + print(f'form {len(batches)} batches, left_ptr={left_ptr}, len(metas)={len(metas)}') + batch_num = len(batches) + print(f'[data preprocess] form_batches done, got {len(batches)} batches, cost {time.time()-st:.2f}s') + try: + if self.num_replicas > 1: + batch_num = torch.tensor([batch_num], device=self.device) + if tdist.is_initialized(): + tdist.all_reduce(batch_num, op=tdist.ReduceOp.MIN) + batch_num = batch_num.item() + except Exception as e: + print(e) + batch_num = batch_num // self.dataloader_workers * self.dataloader_workers + print(f'[data preprocess] form_batches done, got {batch_num} batches') + return batches, batch_num + + def set_global_worker_id(self): + worker_info = torch.utils.data.get_worker_info() + if worker_info: + worker_total_num = worker_info.num_workers + worker_id = worker_info.id + else: + worker_id = 0 + worker_total_num = 1 + assert worker_total_num == self.dataloader_workers, print(worker_total_num, self.dataloader_workers) + self.worker_id = worker_id + self.global_worker_id = self.rank * self.dataloader_workers + worker_id + + def set_epoch(self, epoch): + self.epoch = epoch + self.set_generator() + + def set_generator(self, ): + self.epoch_generator = np.random.default_rng(self.seed + self.epoch) + self.epoch_worker_generator = np.random.default_rng(self.seed + self.epoch + self.worker_id) + self.epoch_global_worker_generator =
np.random.default_rng(self.seed + self.epoch + self.global_worker_id) + self.epoch_rank_generator = np.random.default_rng(self.seed + self.epoch + self.rank) + + def __iter__(self): + self.set_global_worker_id() + self.set_generator() + self.epoch_rank_generator.shuffle(self.batches) + yield_data_cnt = 0 + batch_ind_ptr = self.worker_id + failed_batch_cnt = 0 + last_yield_data_time = time.time() + while yield_data_cnt < self.batch_nums // self.dataloader_workers: + # if True: + try: + if time.time() - last_yield_data_time > 600: + raise ValueError(f'[dataset] it takes too long to yield data, please check your code') + batch_inds = self.batches[batch_ind_ptr%len(self.batches)] + if self.other_args.cache_check_mode in [-2, 2, 3]: # -2, 2, 3: check vae token cache at each iteration + all_has_been_cached = True + all_has_not_been_cached = True + for j in batch_inds: + exist_status = self.exists_cache_file(self.metas[j]) + if exist_status: + all_has_not_been_cached = False + if not exist_status: + all_has_been_cached = False + if self.other_args.cache_check_mode == 2: # all examples must have been cached + if not all_has_been_cached: + batch_ind_ptr += self.dataloader_workers + continue + if self.other_args.cache_check_mode == -2: # at least one example must not have been cached yet + if all_has_been_cached: + batch_ind_ptr += self.dataloader_workers + # print(f"skipping batch_inds {batch_inds}") + continue + if self.other_args.cache_check_mode == 3: # at least one has been cached + if all_has_not_been_cached: + batch_ind_ptr += self.dataloader_workers + continue + + batch_data = [] + for j in batch_inds: + meta = self.metas[j] + if 'image_path' in meta: + ret, model_input = self.prepare_image_input(meta) + elif 'video_path' in meta: + ret, model_input = self.prepare_video_input(meta) + # if not ret: break + if ret: + batch_data.append(model_input) + if not len(batch_data): + batch_ind_ptr += self.dataloader_workers + continue + # raise ValueError(f'[dataset] prepare_video_input failed, continue, failed meta is {meta}') + + captions4images, captions4raw_features, images, raw_features_bcthw, feature_cache_files4images, text_features = [], [], [], [], [], [] + text_feature_cache_files = [] + addition_pn_images = {} + for item in batch_data: + if item['raw_features_cthw'] is None: + images.append(item['img_T3HW'].permute(1,0,2,3)) # tchw -> cthw + for key in item: + if key.startswith('img_T3HW_'): + if key not in addition_pn_images: + addition_pn_images[key] = [] + addition_pn_images[key].append(item[key].permute(1,0,2,3)) + feature_cache_files4images.append(item['feature_cache_file']) + captions4images.append(item['text_input']) + else: + raw_features_bcthw.append(item['raw_features_cthw']) + captions4raw_features.append(item['text_input']) + text_feature_cache_files.append(item['text_feature_cache_file']) + captions = captions4images + captions4raw_features + assert len(batch_data), f'len(batch_data)={len(batch_data)}' + text_cond_tuple = None + yield { + 'captions': captions, + 'images': images, + 'addition_pn_images': addition_pn_images, + 'feature_cache_files4images': feature_cache_files4images, + 'raw_features_bcthw': raw_features_bcthw, + 'text_cond_tuple': text_cond_tuple, + 'text_feature_cache_files': text_feature_cache_files, + 'media': 'videos', + } + yield_data_cnt += 1 + batch_ind_ptr += self.dataloader_workers + del batch_data + del images + del captions + last_yield_data_time = time.time() + except Exception as e: + batch_ind_ptr += self.dataloader_workers + failed_batch_cnt += 1 + if
failed_batch_cnt % 400 == 0: + print(f'failed_batch_cnt: {failed_batch_cnt}, yield_data_cnt: {yield_data_cnt}') + print(f'[dataset] error: {e}') + + def prepare_image_input(self, info) -> Tuple: + try: + img_path, text_input = osp.abspath(info['image_path']), info['caption'] + img_T3HW, raw_features_cthw, feature_cache_file, text_features_lenxdim, text_feature_cache_file = [None] * 5 + # text_input = process_short_text(text_input) + if self.use_text_token_cache: + text_feature_cache_file = osp.join(self.token_cache_dir, 'flan-t5-xl-official', get_prompt_id(text_input)+'.pt') + if osp.exists(text_feature_cache_file): + text_features_lenxdim = torch.load(text_feature_cache_file, weights_only=True) + + if self.add_motion_score2caption: + rand_motion_score = -1 + self.epoch_rank_generator.random() * 21.0 # -1.0 ~ 20.0 + text_input = prepend_motion_score(text_input, rand_motion_score) + if self.use_vae_token_cache: + feature_cache_file = self.get_image_cache_file(img_path) + if osp.exists(feature_cache_file): + try: + raw_features_cthw = torch.load(feature_cache_file, weights_only=True) + except Exception as e: + print(f'load cache file error: {e}') + os.remove(feature_cache_file) + if raw_features_cthw is None and (not self.allow_online_vae_feature_extraction): + return False, None + if raw_features_cthw is None: + with open(img_path, 'rb') as f: + img: PImage.Image = PImage.open(f) + w, h = img.size + h_div_w = h / w + h_div_w_template = self.h_div_w_templates[np.argmin(np.abs((h_div_w-self.h_div_w_templates)))] + tgt_h, tgt_w = self.dynamic_resolution_h_w[h_div_w_template][self.pn]['pixel'] + img = img.convert('RGB') + img_T3HW = transform(img, tgt_h, tgt_w) + img_T3HW = img_T3HW.unsqueeze(0) + assert img_T3HW.shape[1] == 3 + data_item = { + 'text_input': text_input, + 'img_T3HW': img_T3HW, + 'raw_features_cthw': raw_features_cthw, + 'feature_cache_file': feature_cache_file, + 'text_features_lenxdim': text_features_lenxdim, + 'text_feature_cache_file': text_feature_cache_file, + } + return True, data_item + except Exception as e: + print(f'prepare_image_input error: {e}') + return False, None + + def prepare_pair_image_input(self, info) -> Tuple: + pass + + def prepare_pair_video_input(self, info) -> Tuple: + tmp_info = copy.deepcopy(info) + tmp_info['video_path'] = info['win_video_path'] + win_flag, win_data_item = self.prepare_video_input(tmp_info) + assert win_data_item['raw_features_cthw'] is None + + tmp_info['video_path'] = info['lose_video_path'] + lose_flag, lose_data_item = self.prepare_video_input(tmp_info) + assert lose_data_item['raw_features_cthw'] is None + + flag = win_flag and lose_flag + img_T3HW = torch.stack([win_data_item['img_T3HW'], lose_data_item['img_T3HW']], dim=0) # [2,T,C,H,W] + win_data_item['img_T3HW'] = img_T3HW + return flag, win_data_item + + def prepare_video_input(self, info) -> Tuple: + filename, begin_frame_id, end_frame_id = ( + info["video_path"], + info["begin_frame_id"], + info["end_frame_id"], + ) + + if True: + # try: + img_T3HW, raw_features_cthw, feature_cache_file, text_features_lenxdim, text_feature_cache_file = None, None, None, None, None + img_T3HW_4additional_pn = {} + text_input = info['caption'] + if '/vdataset/clip' in filename: # clip + begin_frame_id, end_frame_id = 0, end_frame_id - begin_frame_id + sample_frames = info['sample_frames'] + if self.use_vae_token_cache: + feature_cache_file = self.get_video_cache_file(info["video_path"], begin_frame_id, end_frame_id, self.video_fps) + if osp.exists(feature_cache_file): + try: + pt = 
(sample_frames-1) // self.temporal_compress_rate + 1 + raw_features_cthw = torch.load(feature_cache_file, weights_only=True) + # _, tgt_h, tgt_w = self.dynamic_resolution_h_w[h_div_w_template][self.pn]['pt2scale_schedule'][1][-1] + # assert raw_features_cthw.shape[-2:] == (tgt_h, tgt_w), f'raw_features_cthw.shape[-2:] == (tgt_h, tgt_w): {raw_features_cthw.shape[-2:]} vs {(tgt_h, tgt_w)}' + assert raw_features_cthw.shape[1] >= pt, f'raw_features_cthw.shape[1] >= pt: {raw_features_cthw.shape[1]} vs {pt}' + if raw_features_cthw.shape[1] > pt: + raw_features_cthw = raw_features_cthw[:,:pt] + except Exception as e: + print(f'load video cache file error: {e}') + os.remove(feature_cache_file) + raw_features_cthw = None + if raw_features_cthw is None and (not self.allow_online_vae_feature_extraction): + return False, None + pn_list = [self.pn] + if raw_features_cthw is None: + local_path = info["video_path"] + if not local_path: return False, None + if not osp.exists(local_path): + return False, None + video = EncodedVideoOpencv(local_path, os.path.basename(local_path), num_threads=0) + # video = EncodedVideoDecord(local_path, os.path.basename(local_path), num_threads=0) + start_interval = max(0, begin_frame_id / video._fps) + end_interval = start_interval+(sample_frames-1)/self.video_fps + assert end_interval <= video.duration + 0.2, f'{end_interval=}, but {video.duration=}' # 0.2s margin + end_interval = min(end_interval, video.duration) + raw_video, _ = video.get_clip(start_interval, end_interval, sample_frames) + h, w, _ = raw_video[0].shape + h_div_w = h / w + h_div_w_template = self.h_div_w_templates[np.argmin(np.abs((h_div_w-self.h_div_w_templates)))] + tgt_h, tgt_w = self.dynamic_resolution_h_w[h_div_w_template][self.pn]['pixel'] + + for addition_pn in self.addition_pn_list: + pn_list = pn_list + [addition_pn] + for pn in pn_list: + if isinstance(video, EncodedVideoDecord): + img_T3HW = [transform(Image.fromarray(frame).convert("RGB"), tgt_h, tgt_w) for frame in raw_video] + else: + img_T3HW = [transform(Image.fromarray(frame[:,:,::-1]), tgt_h, tgt_w) for frame in raw_video] + img_T3HW = torch.stack(img_T3HW, 0) + img_T3HW_4additional_pn[pn] = img_T3HW + del video + assert img_T3HW.shape[1] == 3 + data_item = { + 'text_input': text_input, + 'img_T3HW': img_T3HW_4additional_pn.get(self.pn, None), + 'raw_features_cthw': raw_features_cthw, + 'feature_cache_file': feature_cache_file, + 'text_features_lenxdim': text_features_lenxdim, + 'text_feature_cache_file': text_feature_cache_file, + } + for pn in pn_list[1:]: + data_item.update({f'img_T3HW_{pn}': img_T3HW_4additional_pn.get(pn, None)}) + return True, data_item + # except Exception as e: + # # print(f'prepare_video_input error: {e}, info: {info}') + # return False, None + # finally: + # try: + # if (img_T3HW is not None) and local_path and (local_path != filename): + # os.remove(local_path) + # except Exception as e: + # print(f'delete local_path: {local_path} error: {e}, info: {info}') + + @staticmethod + def collate_function(batch, online_t5: bool = False) -> None: + pass + + def random_drop_sentences(self, caption): + elems = [item for item in caption.split('.') if item] + if len(elems) < 2: + return caption + sentences = self.epoch_global_worker_generator.integers(1, len(elems)+1) + return '.'.join(elems[:sentences]) + '.' 
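+ # Caption selection helper: returns either the short or the long caption for a sample; when dynamic-length prompts are enabled it may instead keep a random sentence prefix of the long caption.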
+ + def get_text_input(self, long_text_input, short_text_input, long_text_type): + assert long_text_input or short_text_input + if not long_text_input: + return short_text_input + if not short_text_input: + return long_text_input + random_value = self.epoch_global_worker_generator.random() + assert not self.enable_dynamic_length_prompt + if self.enable_dynamic_length_prompt and long_text_type != 'user_prompt': + long_text_elems = [item for item in long_text_input.split('.') if item] + if len(long_text_elems): + first_sentence_words = [item for item in long_text_elems[0].split(' ') if item] + else: + first_sentence_words = 0 + if len(first_sentence_words) >= 15: + num_sentence4short_text = 1 + else: + num_sentence4short_text = 2 + if not short_text_input: + short_text_input = '.'.join(long_text_elems[:num_sentence4short_text]) + if random_value < self.short_prob: + return short_text_input + if len(long_text_elems) <= num_sentence4short_text: + return long_text_input + select_sentence_num = self.epoch_global_worker_generator.integers(num_sentence4short_text+1, len(long_text_elems)+1) + return '.'.join(long_text_elems[:select_sentence_num]) + else: + if random_value < self.short_prob: + return short_text_input + return long_text_input + + def __len__(self): + return self.batch_nums + + def get_image_cache_file(self, image_path): + elems = image_path.split('/') + elems = [item for item in elems if item] + filename, ext = osp.splitext(elems[-1]) + filename = get_prompt_id(filename) + save_filepath = osp.join(self.token_cache_dir, f'images_pn_{self.pn}', '/'.join(elems[4:-1]), f'{filename}.pt') + return save_filepath + + def get_video_cache_file(self, video_path, begin_frame_id, end_frame_id, video_fps): + elems = video_path.split('/') + elems = [item for item in elems if item] + filename, ext = osp.splitext(elems[-1]) + filename = get_prompt_id(filename) + save_filepath = osp.join(self.token_cache_dir, f'pn_{self.pn}_sample_fps_{video_fps}', '/'.join(elems[4:-1]), f'{filename}_sf_{begin_frame_id}_ef_{end_frame_id}.pt') + return save_filepath + +if __name__ == '__main__': + pass diff --git a/Meissonic/InfinityStar/infinity/models/__init__.py b/Meissonic/InfinityStar/infinity/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b351ccd11354fce6ee441d95254da1759a2e19e3 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import torch +from timm.loss import SoftTargetCrossEntropy + +from timm.models.layers import DropPath + +from .infinity import Infinity, sample_with_top_k_top_p_also_inplace_modifying_logits_ + +def _ex_repr(self): + return ', '.join( + f'{k}=' + (f'{v:g}' if isinstance(v, float) else str(v)) + for k, v in vars(self).items() + if not k.startswith('_') and k != 'training' + and not isinstance(v, (torch.nn.Module, torch.Tensor)) + ) +for clz in (torch.nn.CrossEntropyLoss, SoftTargetCrossEntropy): # no longer __repr__ DropPath with drop_prob + if hasattr(clz, 'extra_repr'): + clz.extra_repr = _ex_repr + else: + clz.__repr__ = lambda self: f'{type(self).__name__}({_ex_repr(self)})' + +DropPath.__repr__ = lambda self: f'{type(self).__name__}(...)' + +alias_dict = {} +for d in range(6, 40+2, 2): + alias_dict[f'd{d}'] = f'infinity_d{d}' +alias_dict_inv = {v: k for k, v in alias_dict.items()} diff --git a/Meissonic/InfinityStar/infinity/models/__pycache__/__init__.cpython-310.pyc 
b/Meissonic/InfinityStar/infinity/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d9f52b9a2487b577a36cd6dc9ac1d779e306020 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/__pycache__/apg.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/__pycache__/apg.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6d331077665c60a04da4c7bee5712e34b4d3c26 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/__pycache__/apg.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/__pycache__/basic.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/__pycache__/basic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4d7ddf4656801502439181b6685396dab6e34a9 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/__pycache__/basic.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/__pycache__/flex_attn_mask.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/__pycache__/flex_attn_mask.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0509bec29cf4056b6858e7906b4e0324d714ae08 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/__pycache__/flex_attn_mask.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/__pycache__/fused_op.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/__pycache__/fused_op.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3019c503e2d4d1eea3b16cc11279b77efc46bff2 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/__pycache__/fused_op.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/__pycache__/infinity.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/__pycache__/infinity.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a54c4cade7e383cc3fa21ed684763b6cb4f2a210 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/__pycache__/infinity.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/__pycache__/rope.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/__pycache__/rope.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04e371c21540e097ab7b0e860645662eaec15f7f Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/__pycache__/rope.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/apg.py b/Meissonic/InfinityStar/infinity/models/apg.py new file mode 100644 index 0000000000000000000000000000000000000000..bc11c621e1aa94041b196fd7a01e679667c2f540 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/apg.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import math +import torch + + +def project( + v0: torch.Tensor, # [B, seq_len, dim] + v1: torch.Tensor, # [B, seq_len, dim] +): + dtype = v0.dtype + v0, v1 = v0.double(), v1.double() + v1 = torch.nn.functional.normalize(v1, dim=[-1,-2]) + v0_parallel = (v0 * v1).sum(dim=[-1,-2], keepdim=True) * v1 + v0_orthogonal = v0 - v0_parallel + return v0_parallel.to(dtype), v0_orthogonal.to(dtype) + + +def normalized_guidance( + pred_cond: torch. 
Tensor, # [B, seq_len, dim] + pred_uncond: torch.Tensor, # [B, seq_len, dim] + guidance_scale: float, + momentum_buffer: None, + eta: float = 1.0, + norm_threshold: float = 0.0, +): + B, seq_len, dim = pred_cond.shape + diff = pred_cond - pred_uncond + if norm_threshold > 0: + ones = torch.ones_like(diff) + diff_norm = 1/math.sqrt(seq_len*dim) * diff.norm(p=2, dim=[-1, -2], keepdim=True) + scale_factor = torch.minimum(ones, norm_threshold / diff_norm) + diff = diff * scale_factor + diff_parallel, diff_orthogonal = project(diff, pred_cond) + normalized_update = diff_orthogonal + eta * diff_parallel + pred_guided = pred_cond + (guidance_scale - 1) * normalized_update + return pred_guided \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/basic.py b/Meissonic/InfinityStar/infinity/models/basic.py new file mode 100644 index 0000000000000000000000000000000000000000..42392306bb032c72e28da84f2f87c5fdccb95744 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/basic.py @@ -0,0 +1,392 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +""" +Definitions of blocks of VAR transformer model. +""" + +import math +import os +from functools import partial +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +from infinity.models.rope import apply_rotary_emb +from infinity.utils.sequence_parallel import sp_all_to_all, SequenceParallelManager as sp_manager + +# Import flash_attn's fused ops +try: + from flash_attn.ops.rms_norm import rms_norm as rms_norm_impl + from flash_attn.ops.fused_dense import fused_mlp_func + flash_fused_op_installed = True +except ImportError: + fused_mlp_func = None + flash_fused_op_installed = False + + def rms_norm_impl(x, weight, epsilon): + return (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True).add_(epsilon))) * weight + + +class FastRMSNorm(nn.Module): + def __init__(self, C, eps=1e-6, elementwise_affine=True): + super().__init__() + self.C = C + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = nn.Parameter(torch.ones(C)) + else: + self.register_buffer('weight', torch.ones(C)) + + def forward(self, x): + src_type = x.dtype + return rms_norm_impl(x.float(), self.weight, epsilon=self.eps).to(src_type) + + def extra_repr(self) -> str: + return f'C={self.C}, eps={self.eps:g}, elementwise_affine={self.elementwise_affine}' + + +def get_dropout_layer(p): + return nn.Dropout(p, inplace=True) if p > 0 else nn.Identity() + + +class FFN(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, drop=0., fused_mlp=False): + super().__init__() + self.fused_mlp_func = fused_mlp_func if fused_mlp else None + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = nn.GELU(approximate='tanh') + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = get_dropout_layer(drop) + self.heuristic = -1 + + def forward(self, x): + if self.fused_mlp_func is not None: + return self.drop(self.fused_mlp_func( + x=x, + weight1=self.fc1.weight, + weight2=self.fc2.weight, + bias1=self.fc1.bias, + bias2=self.fc2.bias, + activation='gelu_approx', + save_pre_act=self.training, + return_residual=False, + checkpoint_lvl=0, + heuristic=self.heuristic, + process_group=None, + )) + else: + return self.drop(self.fc2(self.act(self.fc1(x)))) + + def extra_repr(self) -> str: + 
return f'fused_mlp={self.fused_mlp_func is not None}' + +class Qwen3MLP(nn.Module): + def __init__(self, hidden_size, intermediate_size): + super().__init__() + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = nn.SiLU() + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + +class FFNSwiGLU(nn.Module): + def __init__(self, in_features, hidden_features, out_features=None, drop=0., fused_mlp=False): + super().__init__() + self.fused_mlp_func = None + hidden_features = round(2 * hidden_features / 3 / 256) * 256 + + out_features = out_features or in_features + self.fcg = nn.Linear(in_features, hidden_features, bias=False) + self.fc1 = nn.Linear(in_features, hidden_features, bias=False) + self.fc2 = nn.Linear(hidden_features, out_features, bias=False) + self.drop = get_dropout_layer(drop) + + def forward(self, x): + return self.drop(self.fc2( F.silu(self.fcg(x), inplace=True).mul_(self.fc1(x)) )) + + def extra_repr(self) -> str: + return f'fused_mlp={self.fused_mlp_func is not None}' + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + +class SelfAttention(nn.Module): + def __init__( + self, embed_dim=768, num_heads=12, num_key_value_heads=-1, + use_flex_attn=False, + pad_to_multiplier=1, rope2d_normalized_by_hw=0, + mask_type='var', context_frames=1000000, steps_per_frame=4, + arch='var', + qwen_qkvo_bias=False, + ): + """ + :param embed_dim: model's width + :param num_heads: num heads of multi-head attention + """ + super().__init__() + assert embed_dim % num_heads == 0 + assert num_key_value_heads == -1 or num_heads % num_key_value_heads == 0 + + self.embed_dim = embed_dim + self.num_heads, self.head_dim = num_heads, embed_dim // num_heads + self.num_key_value_heads = num_key_value_heads if num_key_value_heads > 0 else num_heads + self.arch = arch + if self.arch == 'qwen': + self.q_proj = nn.Linear(embed_dim, self.num_heads*self.head_dim, bias=qwen_qkvo_bias) + self.k_proj = nn.Linear(embed_dim, self.num_key_value_heads*self.head_dim, bias=qwen_qkvo_bias) + self.v_proj = nn.Linear(embed_dim, self.num_key_value_heads*self.head_dim, bias=qwen_qkvo_bias) + self.o_proj = nn.Linear(self.num_heads*self.head_dim, embed_dim, bias=qwen_qkvo_bias) + self.q_norm = FastRMSNorm(self.head_dim) + self.k_norm = FastRMSNorm(self.head_dim) + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + else: + raise ValueError(f'arch {self.arch} not supported') + + self.caching = False # kv caching: only used during inference + self.cached_k = {} # kv caching: only used during inference + self.cached_v = {} # kv caching: only used during inference + + self.use_flex_attn = use_flex_attn + self.pad_to_multiplier = 
pad_to_multiplier + + self.rope2d_normalized_by_hw = rope2d_normalized_by_hw + self.mask_type = mask_type + self.context_frames = context_frames + self.steps_per_frame = steps_per_frame + + def kv_caching(self, enable: bool): # kv caching: only used during inference + self.caching = enable + self.cached_k = {} + self.cached_v = {} + + # NOTE: attn_bias_or_two_vector is None during inference + def forward(self, x, attn_bias_or_two_vector: Union[torch.Tensor, Tuple[torch.IntTensor, torch.IntTensor]], attn_fn=None, rope2d_freqs_grid=[], scale_schedule=[], scale_ind=0, context_info=None, last_repetition_step=True, ref_text_scale_inds=[]): + """ + :param (fp32) x: shaped (B or batch_size, L or seq_length, C or hidden_dim); if seq-parallel is used, the `L` dim would be sharded (L = raw_seq_len//sp_size) + :param (fp32) attn_bias_or_two_vector: + if not using_flash: + a block-wise, lower-triangle matrix, like: + [[[[0, -, -, -, -, -, -, -, -, -, -, -, -, -], + [0, 0, 0, 0, 0, -, -, -, -, -, -, -, -, -], + [0, 0, 0, 0, 0, -, -, -, -, -, -, -, -, -], + [0, 0, 0, 0, 0, -, -, -, -, -, -, -, -, -], + [0, 0, 0, 0, 0, -, -, -, -, -, -, -, -, -], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]] + where 0 means visible and - means invisible (-inf) + else: + a tuple of two 1-dim int vector (VAR_visible_kvlen, VAR_invisible_qlen) + :return: shaped (B or batch_size, L or seq_length, C or hidden_dim); if seq-parallel is used, the `L` dim would be sharded + """ + # x: fp32 + B, L, C = x.shape + + if self.arch == 'qwen': + hidden_states = x + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2) + key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) # batch, num_key_value_heads, slen, head_dim + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) # batch, num_key_value_heads, slen, head_dim + + if sp_manager.sp_on(): + # Headnum need to be sharded and L needs to be gathered + # [B, H, raw_L/sp, C] --> [B, H/sp, raw_L, C] + sdim = 1 + gdim = 2 + L = L * sp_manager.get_sp_size() + C = C // sp_manager.get_sp_size() + query_states = sp_all_to_all(query_states, sdim, gdim) + key_states = sp_all_to_all(key_states, sdim, gdim) + value_states = sp_all_to_all(value_states, sdim, gdim) + + query_states, key_states = apply_rotary_emb(query_states, key_states, rope2d_freqs_grid) + if self.caching: # kv caching: only used during inference + if last_repetition_step: + self.cached_k[scale_ind] = key_states + self.cached_v[scale_ind] = value_states + if isinstance(scale_ind, int): + ref_scale_inds = context_info[scale_ind]['ref_sids'] + ref_text_scale_inds + key_states = torch.cat([self.cached_k[ind] for ind in ref_scale_inds] + [key_states], dim=2) + value_states = torch.cat([self.cached_v[ind] for ind in ref_scale_inds] + [value_states], dim=2) + + ref_scale_2_last_use_scale = [-1 for _ in range(len(context_info))] + for si in range(len(context_info)): + for ref_si in context_info[si]['ref_sids']: + ref_scale_2_last_use_scale[ref_si] = si + for ref_si in range(scale_ind): + if 
(ref_scale_2_last_use_scale[ref_si] < scale_ind) and (self.cached_k[ref_si] is not None): + tmpk, tmpv = self.cached_k[ref_si], self.cached_v[ref_si] + self.cached_k[ref_si], self.cached_v[ref_si] = None, None + del tmpk, tmpv + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + scale = self.head_dim**-0.5 + if self.use_flex_attn and attn_fn is not None: + attn_output = attn_fn(query_states.to(value_states.dtype), key_states.to(value_states.dtype), value_states, scale=scale).transpose(1, 2).reshape(B, L, C) + else: + # fa2, flash_attn_func input/output should be (batch_size, seqlen, nheads, headdim) + from flash_attn import flash_attn_qkvpacked_func, flash_attn_func + attn_output = flash_attn_func(query_states.permute([0,2,1,3]).to(torch.bfloat16), key_states.permute([0,2,1,3]).to(torch.bfloat16), value_states.permute([0,2,1,3]).to(torch.bfloat16), softmax_scale=scale) + attn_output = attn_output.reshape(B, L, C) + + # fa3, flash_attn_func input/output should be (batch_size, seqlen, nheads, headdim) + # from flash_attn_interface import flash_attn_qkvpacked_func, flash_attn_func + # attn_output = flash_attn_func(query_states.permute([0,2,1,3]).to(torch.bfloat16), key_states.permute([0,2,1,3]).to(torch.bfloat16), value_states.permute([0,2,1,3]).to(torch.bfloat16), softmax_scale=scale) + # attn_output = attn_output[0].reshape(B, L, C) + + # slow attn + # attn_output = slow_attn(query=query_states, key=key_states, value=value_states, scale=scale, attn_mask=attn_bias_or_two_vector, dropout_p=0).transpose(1, 2).reshape(B, L, C) + if sp_manager.sp_on(): + # [B, raw_L, C/sp] --> [B, raw_L/sp, C] + sdim = 1 + gdim = 2 + attn_output = sp_all_to_all(attn_output, sdim, gdim) + + attn_output = self.o_proj(attn_output) + + return attn_output + + # qkv: amp, bf16 + qkv = F.linear(input=x, weight=self.mat_qkv.weight, bias=torch.cat((self.q_bias, self.zero_k_bias, self.v_bias))).view(B, L, 3, self.num_heads, self.head_dim) # BL3Hc + q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(dim=0); L_dim = 2 # q or k or v: all are shaped in (B:batch_size, H:heads, L:seq_len, c:head_dim), this way + + scale_mul = self.scale_mul_1H11.clamp_max(self.max_scale_mul).exp() # 11H1 (flash), or 1H11 (not flash) + q = F.normalize(q, dim=-1, eps=1e-12).mul(scale_mul).contiguous() # fp32 + k = F.normalize(k, dim=-1, eps=1e-12).contiguous() # fp32 + v = v.contiguous() # bf16 + + if sp_manager.sp_on(): + # Headnum need to be sharded and L needs to be gathered + # [B, H, raw_L/sp, C] --> [B, H/sp, raw_L, C] + sdim = 1 + gdim = 2 + + L = L * sp_manager.get_sp_size() + C = C // sp_manager.get_sp_size() + + q = sp_all_to_all(q, sdim, gdim) + k = sp_all_to_all(k, sdim, gdim) + v = sp_all_to_all(v, sdim, gdim) + + + q, k = apply_rotary_emb(q, k, rope2d_freqs_grid) #, freqs_cis=freqs_cis) + if self.caching: # kv caching: only used during inference + if last_repetition_step: + self.cached_k.append(k) + self.cached_v.append(v) + if scale_ind >= 0: + ref_scale_inds = context_info[scale_ind]['ref_sids'] + k = torch.cat([self.cached_k[0]] + [self.cached_k[ind+1] for ind in ref_scale_inds] + [k], dim=L_dim) + v = torch.cat([self.cached_v[0]] + [self.cached_v[ind+1] for ind in ref_scale_inds] + [v], dim=L_dim) + + ref_scale_2_last_use_scale = [-1 for _ in range(len(context_info))] + for si in range(len(context_info)): + for ref_si in context_info[si]['ref_sids']: + ref_scale_2_last_use_scale[ref_si] = si + for ref_si in range(scale_ind): + if 
(ref_scale_2_last_use_scale[ref_si] < scale_ind) and (self.cached_k[ref_si+1] is not None): + tmpk, tmpv = self.cached_k[ref_si+1], self.cached_v[ref_si+1] + self.cached_k[ref_si+1], self.cached_v[ref_si+1] = None, None + del tmpk, tmpv + + # if self.cos_attn: q, k are in fp32; v is in bf16 + # else: q, k, v are in bf16 + if self.use_flex_attn and attn_fn is not None: + oup = attn_fn(q.to(v.dtype), k.to(v.dtype), v, scale=self.scale).transpose(1, 2).reshape(B, L, C) + else: + # oup = slow_attn(query=q, key=k, value=v, scale=self.scale, attn_mask=attn_bias_or_two_vector, dropout_p=0).transpose(1, 2).reshape(B, L, C) + # fa2, flash_attn_func input/output should be (batch_size, seqlen, nheads, headdim) + from flash_attn import flash_attn_qkvpacked_func, flash_attn_func + oup = flash_attn_func(q.permute([0,2,1,3]).to(torch.bfloat16), k.permute([0,2,1,3]).to(torch.bfloat16), v.permute([0,2,1,3]).to(torch.bfloat16), softmax_scale=self.scale) + oup = oup.reshape(B, L, C) + # oup: bf16 + + if sp_manager.sp_on(): + # [B, raw_L, C/sp] --> [B, raw_L/sp, C] + sdim = 1 + gdim = 2 + oup = sp_all_to_all(oup, sdim, gdim) + + return self.proj_drop(self.proj(oup)) + +class SelfAttnBlock(nn.Module): + def __init__( + self, + embed_dim, + cond_dim, + num_heads, + num_key_value_heads, + mlp_ratio=4.0, + use_flex_attn=False, + pad_to_multiplier=1, + rope2d_normalized_by_hw=False, + mask_type="", + context_frames=-1, + steps_per_frame=-1, + arch="var", + qwen_qkvo_bias=False, + inject_sync=False, + ): + super(SelfAttnBlock, self).__init__() + self.C, self.D = embed_dim, cond_dim + self.arch=arch + self.attn = SelfAttention( + embed_dim=embed_dim, num_heads=num_heads, num_key_value_heads=num_key_value_heads, + use_flex_attn=use_flex_attn, pad_to_multiplier=pad_to_multiplier, rope2d_normalized_by_hw=rope2d_normalized_by_hw, + mask_type=mask_type, context_frames=context_frames, steps_per_frame=steps_per_frame, arch=arch, qwen_qkvo_bias=qwen_qkvo_bias, + ) + if self.arch == 'qwen': + self.mlp = Qwen3MLP(hidden_size=embed_dim, intermediate_size=round(embed_dim * mlp_ratio / 256) * 256) + self.input_layernorm = FastRMSNorm(embed_dim) + self.post_attention_layernorm = FastRMSNorm(embed_dim) + self.inject_sync = inject_sync + else: + raise ValueError(f'arch {self.arch} not supported') + + # NOTE: attn_bias_or_two_vector is None during inference + def forward(self, x, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn=None, rope2d_freqs_grid=[], scale_schedule=[], scale_ind=0, context_info=None, last_repetition_step=True, ref_text_scale_inds=[]): + residual = x + hidden_states = x + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.attn(hidden_states, attn_bias_or_two_vector, attn_fn, rope2d_freqs_grid, scale_schedule, scale_ind, context_info, last_repetition_step, ref_text_scale_inds) + hidden_states = residual + hidden_states + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +if __name__ == '__main__': + pass diff --git a/Meissonic/InfinityStar/infinity/models/ema.py b/Meissonic/InfinityStar/infinity/models/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..95ac3addf0ef51a3db52746a27d4b350f277ec7f --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/ema.py @@ -0,0 +1,26 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import copy +import torch +from collections import 
OrderedDict + + +def get_ema_model(model): + ema_model = copy.deepcopy(model) + ema_model.eval() + for param in ema_model.parameters(): + param.requires_grad = False + return ema_model + +@torch.no_grad() +def update_ema(ema_model, model, decay=0.9999): + """ + Step the EMA model towards the current model. + """ + ema_params = OrderedDict(ema_model.named_parameters()) + model_params = OrderedDict(model.named_parameters()) + + for name, param in model_params.items(): + # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed + ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay) diff --git a/Meissonic/InfinityStar/infinity/models/flex_attn_mask.py b/Meissonic/InfinityStar/infinity/models/flex_attn_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..e2a1558b092b1f2225e8433a955f845542787829 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/flex_attn_mask.py @@ -0,0 +1,129 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from functools import partial +import torch +import numpy as np +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.attention.flex_attention import flex_attention, create_block_mask + +from infinity.schedules.dynamic_resolution import get_full_spatial_size_scale_indices, get_first_full_spatial_size_scale_index + + +def _length_to_offsets(lengths, device): + offsets = [0] + offsets.extend(lengths) + offsets = torch.tensor(offsets, device=device, dtype=torch.int32) + offsets = torch.cumsum(offsets, dim=-1) + return offsets + +def _offsets_to_doc_ids_tensor(offsets): + device = offsets.device + counts = offsets[1:] - offsets[:-1] + visual = torch.repeat_interleave(torch.arange(len(counts), device=device, dtype=torch.int32), counts) + return visual + +def _generate_video_tower_mask(offsets, context_frames, full_resolution_scales, prefix_lens): + document_id = _offsets_to_doc_ids_tensor(offsets) + visual_tokens = offsets[-2] + def _mask_prefix_valid(b, h, q_idx, kv_idx): + text_token_ends = visual_tokens + prefix_lens[b] + return (q_idx >= visual_tokens) & (q_idx < text_token_ends) & (kv_idx >= visual_tokens) & (kv_idx < text_token_ends) + def _mask_visual(b, h, q_idx, kv_idx): + text_token_ends = visual_tokens + prefix_lens[b] + return (q_idx < visual_tokens) & ( + (document_id[q_idx] == document_id[kv_idx]) | + ((kv_idx >= visual_tokens) & (kv_idx < text_token_ends)) | + ( + (document_id[q_idx] > document_id[kv_idx]) & (document_id[q_idx] - document_id[kv_idx] < context_frames) & (document_id[kv_idx] in full_resolution_scales) + ) + ) + def video_tower_mask(b, h, q_idx, kv_idx): + mask_prefix_valid = _mask_prefix_valid(b, h, q_idx, kv_idx) + mask_visual = _mask_visual(b, h, q_idx, kv_idx) + return mask_prefix_valid | mask_visual + return video_tower_mask + +def _generate_two_pyramid_mask(offsets, first_full_spatial_size_scale_index, prefix_lens): + document_id = _offsets_to_doc_ids_tensor(offsets) + visual_tokens = offsets[-2] + def _mask_prefix_valid(b, h, q_idx, kv_idx): + text_token_ends = visual_tokens + prefix_lens[b] + return (q_idx >= visual_tokens) & (q_idx < text_token_ends) & (kv_idx >= visual_tokens) & (kv_idx < text_token_ends) + def _mask_visual(b, h, q_idx, kv_idx): + text_token_ends = visual_tokens + prefix_lens[b] + return (q_idx < visual_tokens) & ( + (document_id[q_idx] == document_id[kv_idx]) | + ((kv_idx >= visual_tokens) & (kv_idx < text_token_ends)) | + (document_id[q_idx] > document_id[kv_idx]) & (document_id[kv_idx] == 
first_full_spatial_size_scale_index) + ) + def video_two_pyramid_mask(b, h, q_idx, kv_idx): + mask_prefix_valid = _mask_prefix_valid(b, h, q_idx, kv_idx) + mask_visual = _mask_visual(b, h, q_idx, kv_idx) + return mask_prefix_valid | mask_visual + return video_two_pyramid_mask + +def _generate_inner_scale_only_mask(offsets, prefix_lens): + document_id = _offsets_to_doc_ids_tensor(offsets) + visual_tokens = offsets[-2] + def _mask_prefix_valid(b, h, q_idx, kv_idx): + text_token_ends = visual_tokens + prefix_lens[b] + return (q_idx >= visual_tokens) & (q_idx < text_token_ends) & (kv_idx >= visual_tokens) & (kv_idx < text_token_ends) + def _mask_visual(b, h, q_idx, kv_idx): + text_token_ends = visual_tokens + prefix_lens[b] + return (q_idx < visual_tokens) & ( + (document_id[q_idx] == document_id[kv_idx]) | + ((kv_idx >= visual_tokens) & (kv_idx < text_token_ends)) + ) + def overall_mask(b, h, q_idx, kv_idx): + mask_prefix_valid = _mask_prefix_valid(b, h, q_idx, kv_idx) + mask_visual = _mask_visual(b, h, q_idx, kv_idx) + return mask_prefix_valid | mask_visual + return overall_mask + +def _generate_infinity_pack(offsets, querysid_refsid): + document_id = _offsets_to_doc_ids_tensor(offsets) # to scale_ind + def overall_mask(b, h, q_idx, kv_idx): + querysid = document_id[q_idx] + kv_sid = document_id[kv_idx] + return querysid_refsid[querysid][kv_sid] + return overall_mask + +def causal(b, h, q_idx, kv_idx): + return q_idx >= kv_idx + +def build_flex_attn_func( + flex_attention, + seq_l, + prefix_lens, + args, + device, + batch_size, + heads, + pad_seq_len, + sequece_packing_scales, + super_scale_lengths, + super_querysid_super_refsid, +): + """ + Build a flex attn function for a given scale schedule. + Args: + flex_attention: compiled flex attention + seq_l: seq length + prefix_lens: valid text prefix lens, [bs] + args: arguments + device: device + batch_size: batch size + heads: heads + pad_seq_len: pad_seq_len + sequece_packing_scales: list of scale schedule + querysid_refsid: list of scale_pack_info + Returns: + attn_fn: flex attn function + """ + assert sum(super_scale_lengths) == seq_l, f'{sum(super_scale_lengths)}!= {seq_l}' + offsets = _length_to_offsets(super_scale_lengths, device=device) + mask_mod = _generate_infinity_pack(offsets, super_querysid_super_refsid) + block_mask = create_block_mask(mask_mod, B = batch_size, H = heads, Q_LEN = seq_l, KV_LEN = seq_l, device = device, _compile = True) + attn_fn = partial(flex_attention, block_mask=block_mask) + return attn_fn diff --git a/Meissonic/InfinityStar/infinity/models/fused_op.py b/Meissonic/InfinityStar/infinity/models/fused_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3e76ec95d375a09b78bac90a2126e0c2a5bcb6da --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/fused_op.py @@ -0,0 +1,30 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import gc +from copy import deepcopy +from typing import Union + +import torch +from torch import nn as nn +from torch.nn import functional as F + + +@torch.compile(fullgraph=True) +def fused_rms_norm(x: torch.Tensor, weight: nn.Parameter, eps: float): + x = x.float() + return (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True).add_(eps))) * weight + + +@torch.compile(fullgraph=True) +def fused_ada_layer_norm(C: int, eps: float, x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor): + x = x.float() + x = F.layer_norm(input=x, normalized_shape=(C,), weight=None, bias=None, eps=eps) + return x.mul(scale.add(1)).add_(shift) + + 
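+# Note (added comment): fused_ada_rms_norm below mirrors fused_ada_layer_norm but uses
+# RMS normalization (rsqrt of the mean of squares, with no mean subtraction) before the
+# AdaLN-style modulation x * (1 + scale) + shift.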
+@torch.compile(fullgraph=True) +def fused_ada_rms_norm(C: int, eps: float, x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor): + x = x.float() + x = (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True).add_(eps))) + return x.mul(scale.add(1)).add_(shift) diff --git a/Meissonic/InfinityStar/infinity/models/infinity.py b/Meissonic/InfinityStar/infinity/models/infinity.py new file mode 100644 index 0000000000000000000000000000000000000000..16abdde49d9711f4fbdee19f04bd994210a3c871 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/infinity.py @@ -0,0 +1,1282 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +""" +Definition of Infinity transformer model. +""" + +import math +import random +import time +from contextlib import nullcontext +from functools import partial +from typing import List, Optional, Tuple, Union, Dict, Any +import json + +import tqdm +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models import register_model +from torch.utils.checkpoint import checkpoint +import numpy as np +from torch.nn.attention.flex_attention import flex_attention + +import infinity.utils.dist as dist +from infinity.utils.dist import for_visualize +from infinity.models.basic import flash_fused_op_installed, SelfAttnBlock, FastRMSNorm +from infinity.models.rope import precompute_rope4d_freqs_grid +from infinity.models.flex_attn_mask import build_flex_attn_func +from infinity.schedules.dynamic_resolution import get_dynamic_resolution_meta, get_first_full_spatial_size_scale_index, get_activated_h_div_w_templates +from infinity.models.apg import normalized_guidance +from infinity.utils.sequence_parallel import sp_split_sequence_by_dim, sp_gather_sequence_by_dim, SequenceParallelManager as sp_manager + +try: + from infinity.models.fused_op import fused_ada_layer_norm, fused_ada_rms_norm +except: + fused_ada_layer_norm, fused_ada_rms_norm = None, None + + +class MultiInpIdentity(nn.Module): + def forward(self, x, *args, **kwargs): + return x + +class SharedAdaLin(nn.Linear): + def forward(self, cond_BD): + C = self.weight.shape[0] // 6 + return super().forward(cond_BD).reshape(-1, 1, 6, C) # B16C + +class MultipleLayers(nn.Module): + def __init__(self, ls, num_blocks_in_a_chunk, index): + super().__init__() + self.module = nn.ModuleList() + for i in range(index, index+num_blocks_in_a_chunk): + self.module.append(ls[i]) + + def forward(self, x, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn=None, scale_schedule=None, checkpointing_full_block=False, rope2d_freqs_grid=None, scale_ind=None, context_info=None, last_repetition_step=True, ref_text_scale_inds=[]): + h = x + for m in self.module: + if checkpointing_full_block: + h = torch.utils.checkpoint.checkpoint(m, h, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn, rope2d_freqs_grid, scale_schedule, scale_ind, context_info, last_repetition_step, ref_text_scale_inds, use_reentrant=False) + else: + h = m(h, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn, rope2d_freqs_grid, scale_schedule, scale_ind, context_info, last_repetition_step, ref_text_scale_inds) + return h + +def get_timestep_embedding(dim, timesteps=1000, max_period=10000): + """ + Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. 
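+
+    Example (illustrative): get_timestep_embedding(dim=8, timesteps=4) returns a
+    [4, 8] tensor; the first 4 columns of row t are cos(t * freqs) and the last 4
+    are sin(t * freqs), for t = 0..3.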
+ """ + assert dim % 2 == 0, "dimension must be even number" + half = dim // 2 + timesteps = torch.arange(timesteps, dtype=torch.float32) + freqs = torch.exp( + -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half + ).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + return embedding + +class Infinity(nn.Module): + def __init__( + self, vae_local, + arch='qwen', # var or qwen + qwen_qkvo_bias=False, # qwen qwen_qkvo_bias + text_channels=0, text_maxlen=0, # text-cond generation + embed_dim=1024, depth=16, + num_key_value_heads=-1, + num_heads=16, mlp_ratio=4., # model's architecture + norm_eps=1e-6, rms_norm=False, # norm layer + cond_drop_rate=0.1, # for classifier-free guidance + rand_uncond=False, + drop_path_rate=0.1, + raw_scale_schedule=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16), + top_p=0.0, + top_k=0.0, + block_chunks=1, + checkpointing=None, + pad_to_multiplier=0, + use_flex_attn=False, + add_lvl_embeding_on_first_block=1, + num_of_label_value=2, + rope2d_each_sa_layer=0, + rope2d_normalized_by_hw=0, + pn=None, + train_h_div_w_list=None, + video_frames=1, + apply_spatial_patchify = 0, + inference_mode=False, + other_args=None, + ): + super().__init__() + # set hyperparameters + self.C = embed_dim + self.vae_embed_dim = vae_local.codebook_dim + self.detail_scale_min_tokens = other_args.detail_scale_min_tokens + self.inference_mode = inference_mode + self.apply_spatial_patchify = apply_spatial_patchify + if self.apply_spatial_patchify: + self.d_vae = vae_local.codebook_dim * 4 + else: + self.d_vae = vae_local.codebook_dim + self.other_args = other_args + self.mask_type = other_args.mask_type + self.context_frames = other_args.context_frames + self.dynamic_resolution_h_w, self.h_div_w_templates = get_dynamic_resolution_meta(other_args.dynamic_scale_schedule, other_args.video_frames) + self.num_of_label_value = num_of_label_value + self.codebook_dim = self.d_vae + self.V = (self.codebook_dim * self.num_of_label_value) if self.num_of_label_value else vae_local.vocab_size + self.Ct5 = text_channels + self.depth = depth + self.num_heads = num_heads + self.image_batch_size = other_args.image_batch_size + self.video_batch_size = other_args.video_batch_size + self.arch = arch + self.mlp_ratio = mlp_ratio + self.cond_drop_rate = cond_drop_rate + self.norm_eps = norm_eps + self.prog_si = -1 + self.pn = pn + self.train_h_div_w_list = get_activated_h_div_w_templates(train_h_div_w_list, self.h_div_w_templates) + self.video_frames = video_frames + + + assert add_lvl_embeding_on_first_block in [0,1] + self.add_lvl_embeding_on_first_block = add_lvl_embeding_on_first_block + assert rope2d_each_sa_layer in [0,1] + self.rope2d_each_sa_layer = rope2d_each_sa_layer + self.rope2d_normalized_by_hw = rope2d_normalized_by_hw + self.image_scale_repetition = json.loads(other_args.image_scale_repetition) + self.video_scale_repetition = json.loads(other_args.video_scale_repetition) + print(f'arch: {arch}, self.pn: {self.pn}, self.codebook_dim: {self.codebook_dim}, self.add_lvl_embeding_on_first_block: {self.add_lvl_embeding_on_first_block}, \ + self.num_of_label_value: {self.num_of_label_value}, self.rope2d_each_sa_layer: {rope2d_each_sa_layer}, self.rope2d_normalized_by_hw: {self.rope2d_normalized_by_hw} \ + self.train_h_div_w_list: {self.train_h_div_w_list}, self.image_scale_repetition: {self.image_scale_repetition}, self.video_scale_repetition: {self.video_scale_repetition}') + 
head_up_method = '' + word_patch_size = 1 if head_up_method in {'', 'no'} else 2 + if word_patch_size > 1: + assert all(raw_pn % word_patch_size == 0 for raw_pn in raw_scale_schedule), f'raw_scale_schedule={raw_scale_schedule}, not compatible with word_patch_size={word_patch_size}' + + self.checkpointing = checkpointing + self.pad_to_multiplier = max(1, pad_to_multiplier) + + self.raw_scale_schedule = raw_scale_schedule # 'raw' means before any patchifying + # solve top-p top-k sampling hyperparameters + self.top_p, self.top_k = max(min(top_p, 1), 0), (round(top_k * self.V) if 0 < top_k < 1 else round(top_k)) + if self.top_p < 1e-5: self.top_p = 0 + if self.top_k >= self.V or self.top_k <= 0: self.top_k = 0 + + t = torch.zeros(dist.get_world_size(), device=dist.get_device()) + t[dist.get_rank()] = float(flash_fused_op_installed) + dist.barrier() + dist.allreduce(t) + assert round(t.sum().item()) in {0, dist.get_world_size()}, f'flash_fused_op_installed: {t}' + + self.rng = torch.Generator(device=dist.get_device()) + self.maybe_record_function = nullcontext + self.text_maxlen = text_maxlen + self.t2i = text_channels != 0 + + # [inp & position embedding] + self.norm0_cond = nn.Identity() + self.selecting_idx = None + self.num_classes = 0 + self.D = self.C + + cfg_uncond = torch.empty(512, self.Ct5) + rng = torch.Generator(device='cpu') + rng.manual_seed(0) + torch.nn.init.trunc_normal_(cfg_uncond, std=1.2, generator=rng) + cfg_uncond /= self.Ct5 ** 0.5 + if rand_uncond: + self.register_buffer('cfg_uncond', cfg_uncond) + else: + self.cfg_uncond = nn.Parameter(cfg_uncond) + + if other_args.simple_text_proj: + self.text_norm = nn.Identity() + self.text_proj = nn.Linear(self.Ct5, self.D) + else: + self.text_norm = FastRMSNorm(self.Ct5, elementwise_affine=True, eps=norm_eps) + self.text_proj = nn.Sequential( + nn.Linear(self.Ct5, self.D), + nn.GELU(approximate='tanh'), + nn.Linear(self.D, self.D), + ) + self.sos_token = nn.Parameter(torch.empty(1, 1, self.D)) + + if self.rope2d_each_sa_layer: + if other_args.rope_type == '4d': + tmp_h_div_w_template = self.train_h_div_w_list[0] + scales_in_one_clip = self.dynamic_resolution_h_w[tmp_h_div_w_template][self.pn]['scales_in_one_clip'] + max_video_scales = self.dynamic_resolution_h_w[tmp_h_div_w_template][self.pn]['max_video_scales'] + if other_args.dynamic_scale_schedule == 'infinity_star_interact': + max_scales = 1000 + else: + max_scales = sum(self.image_scale_repetition) + sum(self.video_scale_repetition) * (max_video_scales//scales_in_one_clip-1) + max_scales = max(max_scales, max_video_scales) + rope2d_freqs_grid = precompute_rope4d_freqs_grid(dim=self.C//self.num_heads, + pad_to_multiplier=self.pad_to_multiplier, rope2d_normalized_by_hw=self.rope2d_normalized_by_hw, + activated_h_div_w_templates=self.train_h_div_w_list, + steps_per_frame=other_args.steps_per_frame, + max_scales=max_scales+10, + max_frames=int(self.video_frames/other_args.temporal_compress_rate+1), + max_height=1800 // 8, max_width=1800 // 8, + text_maxlen=self.text_maxlen, + pn=self.pn, + args=other_args,) + else: + raise ValueError(f'self.rope_type == {self.rope_type} unsupported!') + self.rope2d_freqs_grid = rope2d_freqs_grid + else: + raise ValueError(f'self.rope2d_each_sa_layer={self.rope2d_each_sa_layer} not implemented') + + # [input layers] input norm && input embedding + norm_layer = partial(FastRMSNorm if rms_norm else nn.LayerNorm, eps=norm_eps) + self.norm0_ve = nn.Identity() + self.word_embed = nn.Linear(self.d_vae, self.C) + if self.arch == 'qwen': + 
self.norm_hidden_sates = FastRMSNorm(self.C) + else: + raise ValueError(f'arch={self.arch} not implemented') + + # [backbone and head] + self.use_flex_attn = use_flex_attn + self.attn_fn_compile_dict = {} + if self.use_flex_attn: + self.flex_attention = torch.compile(flex_attention) + + self.unregistered_blocks = [] + for _ in range(depth): + block = SelfAttnBlock( + embed_dim=self.C, + cond_dim=self.D, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=mlp_ratio, + use_flex_attn=use_flex_attn, + pad_to_multiplier=pad_to_multiplier, + rope2d_normalized_by_hw=rope2d_normalized_by_hw, + mask_type=other_args.mask_type, + context_frames=other_args.context_frames, + steps_per_frame=other_args.steps_per_frame, + arch=self.arch, + qwen_qkvo_bias=qwen_qkvo_bias, + inject_sync=other_args.inject_sync, + ) + # block.bfloat16() + self.unregistered_blocks.append(block) + + # [head] + self.head = nn.Linear(self.C, self.other_args.detail_scale_dim*self.other_args.num_of_label_value) + if self.other_args.use_two_stage_lfq: + self.semantic_head2 = nn.Linear(self.C, self.other_args.semantic_scale_dim*self.other_args.num_of_label_value) + + self.num_block_chunks = block_chunks or 1 + self.num_blocks_in_a_chunk = depth // block_chunks + print(f"{self.num_blocks_in_a_chunk=}, {depth=}, {block_chunks=}") + assert self.num_blocks_in_a_chunk * block_chunks == depth + if self.num_block_chunks == 1: + self.blocks = nn.ModuleList(self.unregistered_blocks) + else: + self.block_chunks = nn.ModuleList() + for i in range(self.num_block_chunks): + self.block_chunks.append(MultipleLayers(self.unregistered_blocks, self.num_blocks_in_a_chunk, i*self.num_blocks_in_a_chunk)) + print( + f' [Infinity config ] embed_dim={embed_dim}, num_heads={num_heads}, depth={depth}, mlp_ratio={mlp_ratio}, num_blocks_in_a_chunk={self.num_blocks_in_a_chunk}\n', + end='\n\n', flush=True + ) + + def get_loss_acc(self, x_BLC, sequece_packing_scales, gt): + """ + :param h: hidden_state, shaped (B or batch_size, L or seq_len, C or hidden_dim) + :param cond_BD: shaped (B or batch_size, D or cond_dim) + :param tau: temperature + :return: logits, shaped (B or batch_size, V or vocabulary_size) + """ + if self.arch == 'qwen': + x_BLC = self.norm_hidden_sates(x_BLC) + + with torch.amp.autocast('cuda', enabled=False): + x_BLC = x_BLC.float() + logits_full = self.head(x_BLC) + if self.other_args.use_two_stage_lfq: + logits_semantic_full = self.semantic_head2(x_BLC) + global_token_ptr, global_scale_ptr = 0, 0 + loss_list, acc_list = [], [] + for i in range(len(sequece_packing_scales)): + for j in range(len(sequece_packing_scales[i])): + pt, ph, pw = sequece_packing_scales[i][j] + mul_pt_ph_pw = pt * ph * pw + if ph * pw >= self.detail_scale_min_tokens: + logits = logits_full[:,global_token_ptr:global_token_ptr+mul_pt_ph_pw] + else: + logits = logits_semantic_full[:,global_token_ptr:global_token_ptr+mul_pt_ph_pw] + logits = logits.reshape(x_BLC.shape[0], mul_pt_ph_pw, -1, self.other_args.num_of_label_value) + logits = logits.permute(0,3,1,2) # [1, mul_pt_ph_pw, d, num_of_label_value] -> [1, num_of_label_value, mul_pt_ph_pw, d] + # gt[global_scale_ptr]: [1, mul_pt_ph_pw, d] + loss_this_scale = F.cross_entropy(logits, gt[global_scale_ptr], reduction='none').mean(-1)[0] # [mul_pt_ph_pw] + acc_this_scale = (logits.argmax(1) == gt[global_scale_ptr]).float().mean(-1)[0] # [mul_pt_ph_pw] + + loss_list.append(loss_this_scale) + acc_list.append(acc_this_scale) + global_scale_ptr += 1 + global_token_ptr += mul_pt_ph_pw + loss_list = 
torch.cat(loss_list) + acc_list = torch.cat(acc_list) + else: + gt = torch.cat(gt, 1) # [B, L, d] + logits = logits_full + logits = logits.reshape(x_BLC.shape[0], x_BLC.shape[1], -1, self.other_args.num_of_label_value) + logits = logits.permute(0,3,1,2) # [B, num_of_label_value, L, d] + if self.other_args.num_of_label_value > 1: + loss_list = F.cross_entropy(logits, gt, reduction='none').mean(-1)[0] # [L] + acc_list = (logits.argmax(1) == gt).float().mean(-1)[0] # [L] + elif self.other_args.num_of_label_value == 1: + loss_list = torch.nn.functional.mse_loss(logits.squeeze(1), gt[global_scale_ptr], reduction='none').mean(-1)[0] # [L] + acc_list = loss_list + return loss_list, acc_list + + def get_logits_during_infer(self, x_BLC, is_semantic_scale): + if self.arch == 'qwen': + x_BLC = self.norm_hidden_sates(x_BLC) + with torch.amp.autocast('cuda', enabled=False): + x_BLC = x_BLC.float() + if self.other_args.use_two_stage_lfq: + if is_semantic_scale: + logits = self.semantic_head2(x_BLC) + else: + logits = self.head(x_BLC) + else: + logits = self.head(x_BLC) + return logits + + def pick_visual_tokens( + self, + x_BLC, + sequece_packing_scales, + visual_tokens_len, + args, + ): + visual_tokens = x_BLC[:,:visual_tokens_len] + return visual_tokens + + def forward(self, label_B_or_BLT: Union[torch.LongTensor, Tuple[torch.FloatTensor, torch.IntTensor, int]], x_BLC: torch.Tensor, + visual_rope_cache = None, + sequece_packing_scales = None, # [[(1,1,1)->(5,5,5)], [(1,1,1)->(10,10,10)]] 1LC + super_scale_lengths = None, + super_querysid_super_refsid = None, + other_info_by_scale = None, + gt_BL = None, + **kwargs, + ) -> Union[torch.Tensor, List[torch.Tensor]]: # returns logits_BLV + """ + label_B_or_BLT: label_B or (kv_compact, cu_seqlens_k, max_seqlen_k) + :return: logits BLV, V is vocab_size + """ + + x_BLC= x_BLC.float() # input should be float32 + B = x_BLC.shape[0] + cond_BD_or_gss, ca_kv = None, None + + # [1. 
get input sequence x_BLC] + with torch.amp.autocast('cuda', enabled=False): + kv_compact, lens, cu_seqlens_k, max_seqlen_k = label_B_or_BLT + # 12 kv_compact, lens + + must_on_graph = self.cfg_uncond[0, 0] * 0 + kv_compact[0, 0] += must_on_graph + # drop cond + total = 0 + for le in lens: + if random.random() < self.cond_drop_rate: + kv_compact[total:total+le] = self.cfg_uncond[:le] + total += le + + visual_tokens_len = x_BLC.shape[1] + # forms prefix_tokens + kv_compact = self.text_norm(kv_compact) + kv_compact = self.text_proj(kv_compact).contiguous() + x_BLC = self.word_embed(self.norm0_ve(x_BLC)) # norm0_ve is Identity + x_BLC = torch.cat((x_BLC, kv_compact.unsqueeze(0)), dim=1) + + if self.other_args.train_with_var_seq_len: + pad_seq_len = int(np.ceil(x_BLC.shape[1]/self.pad_to_multiplier))*self.pad_to_multiplier - x_BLC.shape[1] + else: + pad_seq_len = self.other_args.train_max_token_len - x_BLC.shape[1] + if pad_seq_len > 0: + x_BLC = F.pad(x_BLC, (0, 0, 0, pad_seq_len), value=0.0) + + # valid_sequence_ratio = 1 - pad_seq_len / self.other_args.train_max_token_len + valid_sequence_ratio = 1 - pad_seq_len / x_BLC.shape[1] + assert self.use_flex_attn + attn_bias_or_two_vector = None + + attn_fn = build_flex_attn_func( + flex_attention=self.flex_attention, + seq_l=x_BLC.shape[1], + prefix_lens=lens, + args=self.other_args, + device=x_BLC.device, + batch_size=B, + heads=None, + pad_seq_len=pad_seq_len, + sequece_packing_scales=sequece_packing_scales, + super_scale_lengths=super_scale_lengths, + super_querysid_super_refsid=super_querysid_super_refsid, + ) + + # calculate rope cache for this iteration + self.rope2d_freqs_grid['freqs_text'] = self.rope2d_freqs_grid['freqs_text'].to(x_BLC.device) + rope_cache_list = [visual_rope_cache] + for i in range(len(lens)): + rope_cache_list.append(self.rope2d_freqs_grid['freqs_text'][:,:,:,:,:lens[i]]) + rope_cache = torch.cat(rope_cache_list, dim=4) + if pad_seq_len > 0: + rope_cache = F.pad(rope_cache, (0,0,0,pad_seq_len), 'constant', 0.) + assert rope_cache.shape[4] == x_BLC.shape[1], f'{rope_cache.shape[4]} != {x_BLC.shape[1]}' + # [2. block loop] + checkpointing_full_block = self.checkpointing == 'full-block' and self.training + + if sp_manager.sp_on(): + # [B, raw_L, C] --> [B, raw_L/sp_size, C] + x_BLC = sp_split_sequence_by_dim(x_BLC, 1) + + if self.num_block_chunks == 1: + for i, b in enumerate(self.blocks): + if checkpointing_full_block: + x_BLC = torch.utils.checkpoint.checkpoint(b, x_BLC, cond_BD_or_gss, ca_kv, attn_bias_or_two_vector, attn_fn, rope_cache, use_reentrant=False) + else: + x_BLC = b(x=x_BLC, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_bias_or_two_vector, attn_fn=attn_fn, rope2d_freqs_grid=rope_cache) + else: + for i, chunk in enumerate(self.block_chunks): # this path + x_BLC = chunk(x=x_BLC, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_bias_or_two_vector, attn_fn=attn_fn, checkpointing_full_block=checkpointing_full_block, rope2d_freqs_grid=rope_cache) + + if sp_manager.sp_on(): + # [B, raw_L/sp_size, C] --> [B, raw_L, C] + x_BLC = sp_gather_sequence_by_dim(x_BLC, 1) + + # [3. 
unpad the seqlen dim, and then get logits] + x_BLC = self.pick_visual_tokens(x_BLC, sequece_packing_scales, visual_tokens_len, self.other_args) + loss_list, acc_list = self.get_loss_acc(x_BLC, sequece_packing_scales, gt_BL) + return loss_list, acc_list, valid_sequence_ratio + + def prepare_text_conditions( + self, + label_B_or_BLT, + cfg_list, + B, + negative_label_B_or_BLT, + vae_scale_schedule=None, + text_token_only=False, + text_maxlen_this_iter=512, + ): + kv_compact, lens, cu_seqlens_k, max_seqlen_k = label_B_or_BLT + bs = B + if any(np.array(cfg_list) != 1): + bs = 2*B + if not negative_label_B_or_BLT: + kv_compact_un = kv_compact.clone() + total = 0 + for le in lens: + kv_compact_un[total:total+le] = (self.cfg_uncond)[:le] + total += le + kv_compact = torch.cat((kv_compact, kv_compact_un), dim=0) + cu_seqlens_k = torch.cat((cu_seqlens_k, cu_seqlens_k[1:]+cu_seqlens_k[-1]), dim=0) + lens = lens + lens + else: + kv_compact_un, lens_un, cu_seqlens_k_un, max_seqlen_k_un = negative_label_B_or_BLT + kv_compact = torch.cat((kv_compact, kv_compact_un), dim=0) + cu_seqlens_k = torch.cat((cu_seqlens_k, cu_seqlens_k_un[1:]+cu_seqlens_k[-1]), dim=0) + max_seqlen_k = max(max_seqlen_k, max_seqlen_k_un) + lens = lens + lens_un + kv_compact = self.text_norm(kv_compact) + kv_compact = self.text_proj(kv_compact).contiguous() + assert B == 1 + prefix_tokens = torch.zeros((bs, text_maxlen_this_iter, self.C), dtype=kv_compact.dtype, device=kv_compact.device) + total = 0 + for i, le in enumerate(lens): + assert le <= text_maxlen_this_iter + prefix_tokens[i,:le] = kv_compact[total:total+le] + total += le + return prefix_tokens, lens + + @torch.no_grad() + def autoregressive_infer( + self, + args=None, + **kwargs, + ): + if 'infinity_elegant' in args.dynamic_scale_schedule: + infer_func = self.ar_infer_infinity_elegant + elif 'infinity_star_interact' in args.dynamic_scale_schedule: + infer_func = self.ar_infer_infinity_star_interact + else: + infer_func = self.autoregressive_infer_cfg + return infer_func(args=args, **kwargs) + + def embeds_codes2input( + self, + last_stage, # [B, d, t, h, w] + repeat=1, + ): + if self.apply_spatial_patchify: # patchify operation + last_stage = last_stage.permute(0,2,1,3,4) # [B, t, d, 2h, 2w] + last_stage = torch.nn.functional.pixel_unshuffle(last_stage, 2) # [B, t, 4d, h, w] + last_stage = last_stage.permute(0,2,1,3,4) # [B, 4d, t, h, w] + last_stage = last_stage.reshape(*last_stage.shape[:2], -1) # [B, d, t*h*w] or [B, 4d, t*h*w] + last_stage = torch.permute(last_stage, [0,2,1]) # [B, t*h*w, d] or [B, t*h*w, 4d] + last_stage = self.word_embed(self.norm0_ve(last_stage)) + last_stage = last_stage.repeat(repeat, 1, 1) + return last_stage + + @torch.no_grad() + def ar_infer_infinity_elegant( + self, + vae=None, + scale_schedule=None, + label_B_or_BLT=None, + B=1, negative_label_B_or_BLT=None, + g_seed=None, cfg_list=[], tau_list=[], top_k=0, top_p=0.0, + trunk_scale=1000, + gt_leak=0, gt_ls_Bl=None, + low_vram_mode=False, + args=None, + get_visual_rope_embeds=None, + context_info=None, + return_summed_code_only=False, + **kwargs, + ): # returns List[idx_Bl] + from infinity.schedules.infinity_elegant import interpolate + if g_seed is None: rng = None + else: self.rng.manual_seed(g_seed); rng = self.rng + assert len(cfg_list) >= len(scale_schedule) + assert len(tau_list) >= len(scale_schedule) + assert args.use_cfg + args.use_apg == 1 + device = label_B_or_BLT[0].device + if self.apply_spatial_patchify: + vae_scale_schedule = [(pt, 2*ph, 2*pw) for pt, ph, pw in 
scale_schedule] + else: + vae_scale_schedule = scale_schedule + # calculate rope cache for this iteration + self.rope2d_freqs_grid['freqs_text'] = self.rope2d_freqs_grid['freqs_text'].to(device) + text_maxlen_this_iter = label_B_or_BLT[-1] # self.text_maxlen # kv_compact, lens, cu_seqlens_k, max_seqlen_k = label_B_or_BLT + prefix_tokens, lens = self.prepare_text_conditions(label_B_or_BLT, cfg_list, B, negative_label_B_or_BLT, vae_scale_schedule, text_token_only=False, text_maxlen_this_iter=text_maxlen_this_iter) + bs = prefix_tokens.shape[0] + ca_kv, cond_BD_or_gss, attn_mask = None, None, None + ret, idx_Bl_list = [], [] # current length, list of reconstructed images + for b in self.unregistered_blocks: b.attn.kv_caching(True) + first_full_spatial_size_scale_index = get_first_full_spatial_size_scale_index(scale_schedule) + image_scale_repetition = np.array(json.loads(args.image_scale_repetition)) + video_scale_repetition = np.array(json.loads(args.video_scale_repetition)) + scales_in_one_clip = first_full_spatial_size_scale_index + 1 + assert len(image_scale_repetition) == len(video_scale_repetition), f'{len(image_scale_repetition)} != {len(video_scale_repetition)}' + assert len(image_scale_repetition) == scales_in_one_clip, f'{len(image_scale_repetition)} != {scales_in_one_clip}' + total_steps = image_scale_repetition.sum() + video_scale_repetition.sum() * (len(scale_schedule)//len(video_scale_repetition)-1) + 1 # +1 is prefix text token forward step + pbar = tqdm.tqdm(total=total_steps) + block_chunks = self.block_chunks if self.num_block_chunks > 1 else self.blocks + + noise_shape = vae_scale_schedule[0] + if self.other_args.noise_input: + noise = torch.randn((1, self.vae_embed_dim, *noise_shape), dtype=prefix_tokens.dtype, device=prefix_tokens.device) + else: + noise = torch.zeros((1, self.vae_embed_dim, *noise_shape), dtype=prefix_tokens.dtype, device=prefix_tokens.device) + + summed_codes = [noise[0:1]] + sos_token = self.embeds_codes2input(noise, bs//1) + # text tokens forward + rope_cache = self.rope2d_freqs_grid['freqs_text'][:,:,:,:,:text_maxlen_this_iter] + last_stage = prefix_tokens + pbar.update(1) + for block_idx, b in enumerate(block_chunks): + last_stage = b(x=last_stage, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_mask, attn_fn=None, scale_schedule=scale_schedule, rope2d_freqs_grid=rope_cache, scale_ind='t0', context_info=context_info, last_repetition_step=True) + + # visual tokens forward + ref_text_scale_inds = ['t0'] + last_stage = sos_token + cum_scales = 0 + for si, pn in enumerate(scale_schedule): # si: i-th segment + rel_si_in_one_clip = si % scales_in_one_clip + if si < scales_in_one_clip: # image + repeat_times = image_scale_repetition[si%scales_in_one_clip] + target_pn = vae_scale_schedule[first_full_spatial_size_scale_index] + else: + repeat_times = video_scale_repetition[si%scales_in_one_clip] + target_pn = vae_scale_schedule[-1] + cfg = cfg_list[si] + infer_repeat_times = min(repeat_times, args.max_repeat_times) + for repeat_idx in range(infer_repeat_times): + # print(f'real scale ind is : {cum_scales+repeat_idx}') + rope_cache = get_visual_rope_embeds(self.rope2d_freqs_grid, scale_schedule, si, cum_scales+repeat_idx, device, args, context_info, first_full_spatial_size_scale_index) + pbar.update(1) + last_repetition_step = (repeat_idx == (infer_repeat_times-1)) + for block_idx, b in enumerate(block_chunks): + last_stage = b(x=last_stage, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_mask, attn_fn=None, 
scale_schedule=scale_schedule, rope2d_freqs_grid=rope_cache, scale_ind=si, context_info=context_info, last_repetition_step=last_repetition_step, ref_text_scale_inds=ref_text_scale_inds) + logits_BlV = self.get_logits_during_infer(last_stage, is_semantic_scale=rel_si_in_one_clip < args.semantic_scales).mul(1/tau_list[si]) + if cfg != 1: + # print(f'add cfg on add_cfg_on_logits') + if args.use_cfg: + logits_BlV = cfg * logits_BlV[:B] + (1-cfg) * logits_BlV[B:] + elif args.use_apg: + pred_cond = logits_BlV[:B] + pred_uncond = logits_BlV[B:] + pred_guided = normalized_guidance(pred_cond, pred_uncond, guidance_scale=cfg, momentum_buffer=None, eta=0, norm_threshold=args.apg_norm_threshold) + # pred_guided = cfg * pred_cond + (1-cfg) * pred_uncond + logits_BlV = pred_guided + else: + logits_BlV = logits_BlV[:B] + + tmp_bs, tmp_seq_len = logits_BlV.shape[:2] + logits_BlV = logits_BlV.reshape(tmp_bs, -1, self.num_of_label_value) + probs_Bld = logits_BlV.softmax(dim=-1) # [B, thwd or thw4d, 2] + idx_Bld = torch.multinomial(probs_Bld.view(-1, self.num_of_label_value), num_samples=1, replacement=True, generator=rng).view(tmp_bs, -1) # [B, thwd or thw4d] + probs_Bld = torch.gather(probs_Bld, dim=2, index=idx_Bld.unsqueeze(-1)).squeeze(-1) + + def Bld2Bthwd(item): + item = item.reshape(tmp_bs, tmp_seq_len, -1) # [B, thw, d or 4d] + item = item.reshape(B, pn[0], pn[1], pn[2], -1) # shape: [B, t, h, w, d] or [B, t, h, w, 4d] + if self.apply_spatial_patchify: # unpatchify operation + item = item.permute(0,1,4,2,3) # [B, t, 4d, h, w] + item = torch.nn.functional.pixel_shuffle(item, 2) # [B, t, d, 2h, 2w] + item = item.permute(0,1,3,4,2) # [B, t, 2h, 2w, d] + return item + + idx_Bld = Bld2Bthwd(idx_Bld) + probs_Bld = Bld2Bthwd(probs_Bld) + # print(f'{si=} {repeat_idx=} idx_Bld.shape={idx_Bld.shape}') + + if si < gt_leak: + idx_Bld = gt_ls_Bl[cum_scales+repeat_idx] + # idx_Bld [B, t, h, w, d] or [B, t, 2h, 2w, d] + if self.other_args.use_two_stage_lfq: + if pn[1] * pn[2] >= vae.quantizer.detail_scale_min_tokens: + is_semantic_scale = False + lfq = vae.quantizer.lfq_detail + else: + is_semantic_scale = True + lfq = vae.quantizer.lfq_semantic + codes = lfq.indices_to_codes(idx_Bld, 'bit_label') + codes = interpolate(codes, size=(self.vae_embed_dim, *target_pn), mode=vae.quantizer.z_interplote_up, quantizer=vae.quantizer, is_semantic_scale=is_semantic_scale).contiguous() + else: + codes = vae.quantizer.lfq_detail.indices_to_codes(idx_Bld, 'bit_label') + codes = F.interpolate(codes, size=target_pn, mode=vae.quantizer.z_interplote_up) + summed_codes[-1] = F.interpolate(summed_codes[-1], size=target_pn, mode=vae.quantizer.z_interplote_up) + summed_codes[-1] += codes + if repeat_idx < repeat_times - 1: + last_stage = F.interpolate(summed_codes[-1], size=vae_scale_schedule[si], mode=vae.quantizer.z_interplote_down) + last_stage = self.embeds_codes2input(last_stage, bs//B) + cum_scales += repeat_times + if si < len(scale_schedule)-1: + if scale_schedule[si][-2:] == scale_schedule[-1][-2:]: + if self.other_args.noise_input: + summed_codes.append(torch.randn((B, summed_codes[-1].shape[1], *vae_scale_schedule[si+1]), device=summed_codes[-1].device, dtype=summed_codes[-1].dtype)) + else: + summed_codes.append(torch.zeros((B, summed_codes[-1].shape[1], *vae_scale_schedule[si+1]), device=summed_codes[-1].device, dtype=summed_codes[-1].dtype)) + last_stage = summed_codes[-1] + else: + last_stage = F.interpolate(summed_codes[-1], size=vae_scale_schedule[si+1], mode=vae.quantizer.z_interplote_down) + last_stage = 
self.embeds_codes2input(last_stage, bs//B) + summed_codes = torch.cat(summed_codes, dim=-3) + for b in self.unregistered_blocks: b.attn.kv_caching(False) + if return_summed_code_only: + return summed_codes + else: + if low_vram_mode: vae.to('cuda') + img = self.summed_codes2images(vae, summed_codes) + return idx_Bl_list, img + + + @torch.no_grad() + def ar_infer_infinity_star_interact( + self, + vae=None, + scale_schedule=None, + label_B_or_BLT=None, + B=1, negative_label_B_or_BLT=None, + g_seed=None, cfg_list=[], tau_list=[], top_k=0, top_p=0.0, + trunk_scale=1000, + gt_leak=0, gt_ls_Bl=None, + low_vram_mode=False, + args=None, + get_visual_rope_embeds=None, + context_info=None, + return_summed_code_only=False, + mode='', + former_clip_features=None, + first_frame_features=None, + semantic_scale_ind = 7, + detail_frame_inds = [18,19], + **kwargs, + ): # returns List[idx_Bl] + from infinity.schedules.infinity_star_interact import interpolate + assert len(cfg_list) >= len(scale_schedule) + assert len(tau_list) >= len(scale_schedule) + assert args.use_apg + args.use_cfg == 1 + device = label_B_or_BLT[0].device + if g_seed is None: + rng = None + else: + self.rng = torch.Generator(device=device) + self.rng.manual_seed(g_seed) + rng = self.rng + + if self.apply_spatial_patchify: + vae_scale_schedule = [(pt, 2*ph, 2*pw) for pt, ph, pw in scale_schedule] + else: + vae_scale_schedule = scale_schedule + # calculate rope cache for this iteration + self.rope2d_freqs_grid['freqs_text'] = self.rope2d_freqs_grid['freqs_text'].to(device) + text_maxlen_this_iter = label_B_or_BLT[-1] # self.text_maxlen # kv_compact, lens, cu_seqlens_k, max_seqlen_k = label_B_or_BLT + prefix_tokens, _ = self.prepare_text_conditions(label_B_or_BLT, cfg_list, B, negative_label_B_or_BLT, vae_scale_schedule, text_token_only=False, text_maxlen_this_iter=text_maxlen_this_iter) + bs = prefix_tokens.shape[0] + + ca_kv, cond_BD_or_gss, attn_mask = None, None, None + for b in self.unregistered_blocks: b.attn.kv_caching(True) + first_full_spatial_size_scale_index = get_first_full_spatial_size_scale_index(scale_schedule) + image_scale_repetition = np.array(json.loads(args.image_scale_repetition)) + video_scale_repetition = np.array(json.loads(args.video_scale_repetition)) + scales_in_one_clip = first_full_spatial_size_scale_index + 1 + assert len(image_scale_repetition) == len(video_scale_repetition), f'{len(image_scale_repetition)} != {len(video_scale_repetition)}' + assert len(image_scale_repetition) == scales_in_one_clip, f'{len(image_scale_repetition)} != {scales_in_one_clip}' + total_steps = image_scale_repetition.sum() + video_scale_repetition.sum() * (len(scale_schedule)//len(video_scale_repetition)-1) + 1 # +1 is prefix text token forward step + if mode == 'second_v_clip': + total_steps += 2 + pbar = tqdm.tqdm(total=total_steps) + block_chunks = self.block_chunks if self.num_block_chunks > 1 else self.blocks + + noise_shape = vae_scale_schedule[0] + if self.other_args.noise_input: + noise = torch.randn((1, self.vae_embed_dim, *noise_shape), dtype=prefix_tokens.dtype, device=prefix_tokens.device) + else: + noise = torch.zeros((1, self.vae_embed_dim, *noise_shape), dtype=prefix_tokens.dtype, device=prefix_tokens.device) + + summed_codes = [noise[0:1]] + sos_token = self.embeds_codes2input(noise, bs//1) + # text tokens forward + rope_cache = self.rope2d_freqs_grid['freqs_text'][:,:,:,:,:text_maxlen_this_iter] + last_stage = prefix_tokens + for block_idx, b in enumerate(block_chunks): + last_stage = b(x=last_stage, 
cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_mask, attn_fn=None, scale_schedule=scale_schedule, rope2d_freqs_grid=rope_cache, scale_ind=f't0', context_info=context_info, last_repetition_step=True) + pbar.update(1) + + ref_text_scale_inds = ['t0'] + + # visual condition forward + if mode == 'second_v_clip': + assert former_clip_features.shape[-3] == 21 + former_clip_features = former_clip_features[:,:,1:] + last_stage = F.interpolate(former_clip_features, size=(20, *vae_scale_schedule[semantic_scale_ind][-2:]), mode=vae.quantizer.z_interplote_down) + rope_cache = get_visual_rope_embeds(self.rope2d_freqs_grid, scale_schedule[-1], last_stage.shape[-3:], list(range(1, 21)), 800, device) + last_stage = self.embeds_codes2input(last_stage, bs//B) + for block_idx, b in enumerate(block_chunks): + last_stage = b(x=last_stage, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_mask, attn_fn=None, scale_schedule=scale_schedule, rope2d_freqs_grid=rope_cache, scale_ind=f'semantic_condition', context_info=context_info, last_repetition_step=True) + pbar.update(1) + + last_stage = torch.cat([first_frame_features, former_clip_features[:,:,detail_frame_inds]], dim=2) + rope_cache = get_visual_rope_embeds(self.rope2d_freqs_grid, scale_schedule[-1], last_stage.shape[-3:], [0]+[item+1 for item in detail_frame_inds], 801, device) + last_stage = self.embeds_codes2input(last_stage, bs//B) + for block_idx, b in enumerate(block_chunks): + last_stage = b(x=last_stage, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_mask, attn_fn=None, scale_schedule=scale_schedule, rope2d_freqs_grid=rope_cache, scale_ind=f'detail_condition', context_info=context_info, last_repetition_step=True) + pbar.update(1) + + ref_text_scale_inds.extend(['semantic_condition', 'detail_condition']) + + # visual tokens forward + last_stage = sos_token + cum_scales = 0 + for si, pn in enumerate(scale_schedule): # si: i-th segment + rel_si_in_one_clip = si % scales_in_one_clip + if si < scales_in_one_clip: # image + repeat_times = image_scale_repetition[rel_si_in_one_clip] + target_pn = vae_scale_schedule[first_full_spatial_size_scale_index] + else: + repeat_times = video_scale_repetition[rel_si_in_one_clip] + target_pn = vae_scale_schedule[-1] + cfg = cfg_list[si] + infer_repeat_times = min(repeat_times, args.max_repeat_times) + for repeat_idx in range(infer_repeat_times): + frame_ss, frame_ee = context_info[si]['frame_ss'], context_info[si]['frame_ee'] + rope_cache = get_visual_rope_embeds(self.rope2d_freqs_grid, scale_schedule[-1], scale_schedule[si], list(range(frame_ss, frame_ee)), cum_scales+repeat_idx, device) + last_repetition_step = (repeat_idx == (infer_repeat_times-1)) + for block_idx, b in enumerate(block_chunks): + last_stage = b(x=last_stage, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_mask, attn_fn=None, scale_schedule=scale_schedule, rope2d_freqs_grid=rope_cache, scale_ind=si, context_info=context_info, last_repetition_step=last_repetition_step, ref_text_scale_inds=ref_text_scale_inds) + logits_BlV = self.get_logits_during_infer(last_stage, is_semantic_scale=rel_si_in_one_clip < args.semantic_scales).mul(1/tau_list[si]) + if cfg != 1: + # print(f'add cfg on add_cfg_on_logits') + if args.use_cfg: + logits_BlV = cfg * logits_BlV[:B] + (1-cfg) * logits_BlV[B:] + elif args.use_apg: + pred_cond = logits_BlV[:B] + pred_uncond = logits_BlV[B:] + pred_guided = normalized_guidance(pred_cond, pred_uncond, guidance_scale=cfg, momentum_buffer=None, eta=0, 
norm_threshold=args.apg_norm_threshold) + # pred_guided = cfg * pred_cond + (1-cfg) * pred_uncond + logits_BlV = pred_guided + else: + logits_BlV = logits_BlV[:B] + + tmp_bs, tmp_seq_len = logits_BlV.shape[:2] + logits_BlV = logits_BlV.reshape(tmp_bs, -1, self.num_of_label_value) + probs_Bld = logits_BlV.softmax(dim=-1) # [B, thwd or thw4d, 2] + idx_Bld = torch.multinomial(probs_Bld.view(-1, self.num_of_label_value), num_samples=1, replacement=True, generator=rng).view(tmp_bs, -1) # [B, thwd or thw4d] + probs_Bld = torch.gather(probs_Bld, dim=2, index=idx_Bld.unsqueeze(-1)).squeeze(-1) + + def Bld2Bthwd(item): + item = item.reshape(tmp_bs, tmp_seq_len, -1) # [B, thw, d or 4d] + item = item.reshape(B, pn[0], pn[1], pn[2], -1) # shape: [B, t, h, w, d] or [B, t, h, w, 4d] + if self.apply_spatial_patchify: # unpatchify operation + item = item.permute(0,1,4,2,3) # [B, t, 4d, h, w] + item = torch.nn.functional.pixel_shuffle(item, 2) # [B, t, d, 2h, 2w] + item = item.permute(0,1,3,4,2) # [B, t, 2h, 2w, d] + return item + + idx_Bld = Bld2Bthwd(idx_Bld) + probs_Bld = Bld2Bthwd(probs_Bld) + + if si < gt_leak: + acc = (idx_Bld==gt_ls_Bl[cum_scales+repeat_idx]).float().mean() * 100. + idx_Bld = gt_ls_Bl[cum_scales+repeat_idx] + print(f'{si=} {repeat_idx=} idx_Bld.shape={idx_Bld.shape} {acc=}%') + + # idx_Bld [B, t, h, w, d] or [B, t, 2h, 2w, d] + if self.other_args.use_two_stage_lfq: + if si >= args.semantic_scales: + is_semantic_scale = False + lfq = vae.quantizer.lfq_detail + else: + is_semantic_scale = True + lfq = vae.quantizer.lfq_semantic + codes = lfq.indices_to_codes(idx_Bld, 'bit_label') + codes = interpolate(codes, size=(self.vae_embed_dim, *target_pn), mode=vae.quantizer.z_interplote_up, quantizer=vae.quantizer, is_semantic_scale=is_semantic_scale).contiguous() + else: + codes = vae.quantizer.lfq_detail.indices_to_codes(idx_Bld, 'bit_label') + codes = F.interpolate(codes, size=target_pn, mode=vae.quantizer.z_interplote_up) + summed_codes[-1] = F.interpolate(summed_codes[-1], size=target_pn, mode=vae.quantizer.z_interplote_up) + summed_codes[-1] += codes + if repeat_idx < repeat_times - 1: + last_stage = F.interpolate(summed_codes[-1], size=vae_scale_schedule[si], mode=vae.quantizer.z_interplote_down) + last_stage = self.embeds_codes2input(last_stage, bs//B) + pbar.update(1) + cum_scales += repeat_times + if si < len(scale_schedule)-1: + if scale_schedule[si][-2:] == scale_schedule[-1][-2:]: + if self.other_args.noise_input: + summed_codes.append(torch.randn((B, summed_codes[-1].shape[1], *vae_scale_schedule[si+1]), device=summed_codes[-1].device, dtype=summed_codes[-1].dtype)) + else: + summed_codes.append(torch.zeros((B, summed_codes[-1].shape[1], *vae_scale_schedule[si+1]), device=summed_codes[-1].device, dtype=summed_codes[-1].dtype)) + last_stage = summed_codes[-1] + else: + last_stage = F.interpolate(summed_codes[-1], size=vae_scale_schedule[si+1], mode=vae.quantizer.z_interplote_down) + last_stage = self.embeds_codes2input(last_stage, bs//B) + summed_codes = torch.cat(summed_codes, dim=-3) + for b in self.unregistered_blocks: b.attn.kv_caching(False) + if mode == 'second_v_clip': + this_clip_frames = summed_codes.shape[2] * 4 + summed_codes = torch.cat([former_clip_features, summed_codes], dim=-3) + img = self.summed_codes2images(vae, summed_codes) # [bs, t, h, w, 3] + img = img[:,-this_clip_frames:] + summed_codes = summed_codes[:,:,-21:] + assert summed_codes.shape[2] == 21, f'wrong shape: {summed_codes.shape=}' + else: + img = self.summed_codes2images(vae, summed_codes) + + if 
low_vram_mode: vae.to('cuda') + return summed_codes, img + + @torch.no_grad() + def autoregressive_infer_cfg( + self, + vae=None, + scale_schedule=None, + label_B_or_BLT=None, + B=1, negative_label_B_or_BLT=None, + g_seed=None, cfg_list=[], tau_list=[], top_k=0, top_p=0.0, + returns_vemb=0, + trunk_scale=1000, + gt_leak=0, gt_ls_Bl=None, + low_vram_mode=False, + args=None, + get_visual_rope_embeds=None, + **kwargs, + ): # returns List[idx_Bl] + if g_seed is None: rng = None + else: self.rng.manual_seed(g_seed); rng = self.rng + assert len(cfg_list) >= len(scale_schedule) + assert len(tau_list) >= len(scale_schedule) + assert args.use_cfg + args.use_apg == 1 + device = label_B_or_BLT[0].device + if self.apply_spatial_patchify: + vae_scale_schedule = [(pt, 2*ph, 2*pw) for pt, ph, pw in scale_schedule] + else: + vae_scale_schedule = scale_schedule + # calculate rope cache for this iteration + self.rope2d_freqs_grid['freqs_text'] = self.rope2d_freqs_grid['freqs_text'].to(device) + text_maxlen_this_iter = self.text_maxlen + last_stage, lens, _ = self.prepare_text_conditions(label_B_or_BLT, cfg_list, B, negative_label_B_or_BLT, args.input_noise, vae_scale_schedule) + bs = last_stage.shape[0] + ca_kv, cond_BD_or_gss = None, None + ret, idx_Bl_list = [], [] # current length, list of reconstructed images + for b in self.unregistered_blocks: b.attn.kv_caching(True) + summed_codes = 0 + for si, pn in enumerate(scale_schedule): # si: i-th segment + visual_rope_cache = get_visual_rope_embeds(self.rope2d_freqs_grid, scale_schedule, si, device, args) + if si == 0: + rope_cache = torch.cat([self.rope2d_freqs_grid['freqs_text'][:,:,:,:,:text_maxlen_this_iter], visual_rope_cache], dim=4) + else: + rope_cache = visual_rope_cache + attn_mask = torch.ones((last_stage.shape[0], 1, last_stage.shape[1], text_maxlen_this_iter+np.array(pn).prod()), device=last_stage.device).bool() # [bs, q_heads, q_len, all_k_len], here set q_heads=1 for broadcasting + assert len(attn_mask) == len(lens) + for tmp_i, le in enumerate(lens): + attn_mask[tmp_i, :, :, le:text_maxlen_this_iter] = False + if si == 0: + attn_mask[tmp_i, :, :text_maxlen_this_iter, text_maxlen_this_iter:] = False + cfg = cfg_list[si] + if si >= trunk_scale: + break + for block_idx, b in enumerate(self.block_chunks): + for m in b.module: + last_stage = m(x=last_stage, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_mask, attn_fn=None, scale_schedule=scale_schedule, rope2d_freqs_grid=rope_cache, scale_ind=si) + if si == 0: + last_stage = last_stage[:, text_maxlen_this_iter:] + # import pdb; pdb.set_trace() + if cfg != 1: + # print(f'add cfg on add_cfg_on_logits') + logits_BlV = self.get_logits(last_stage).mul(1/tau_list[si]) + if args.use_cfg: + logits_BlV = cfg * logits_BlV[:B] + (1-cfg) * logits_BlV[B:] + elif args.use_apg: + pred_cond = logits_BlV[:B] + pred_uncond = logits_BlV[B:] + pred_guided = normalized_guidance(pred_cond, pred_uncond, guidance_scale=cfg, momentum_buffer=None, eta=0, norm_threshold=10) + # pred_guided = cfg * pred_cond + (1-cfg) * pred_uncond + logits_BlV = pred_guided + else: + logits_BlV = self.get_logits(last_stage[:B]).mul(1/tau_list[si]) + if self.num_of_label_value == 1: + idx_Bld = logits_BlV + elif self.num_of_label_value > 1: + tmp_bs, tmp_seq_len = logits_BlV.shape[:2] + logits_BlV = logits_BlV.reshape(tmp_bs, -1, self.num_of_label_value) + idx_Bld = sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV, rng=rng, top_k=top_k or self.top_k, top_p=top_p or self.top_p, num_samples=1)[:, :, 0] + 
idx_Bld = idx_Bld.reshape(tmp_bs, tmp_seq_len, -1) + elif self.num_of_label_value == 0: + idx_Bl = sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV, rng=rng, top_k=top_k or self.top_k, top_p=top_p or self.top_p, num_samples=1)[:, :, 0] + assert returns_vemb + if si < gt_leak: + idx_Bld = gt_ls_Bl[si] + else: + idx_Bld = idx_Bld.reshape(B, pn[0], pn[1], pn[2], -1) # shape: [B, t, h, w, d] or [B, t, h, w, 4d] + if self.apply_spatial_patchify: # unpatchify operation + idx_Bld = idx_Bld.permute(0,1,4,2,3) # [B, t, 4d, h, w] + idx_Bld = torch.nn.functional.pixel_shuffle(idx_Bld, 2) # [B, t, d, 2h, 2w] + idx_Bld = idx_Bld.permute(0,1,3,4,2) # [B, t, 2h, 2w, d] + # idx_Bld [B, t, h, w, d] or [B, t, 2h, 2w, d] + + # idx_Bld_list.append(idx_Bld) + if self.num_of_label_value == 1: + if si < gt_leak: + codes = vae.quantizer.lfq_detail.indices_to_codes(idx_Bld, label_type='bit_label') # [B, d, t, h, w] or [B, d, t, 2h, 2w] + else: + codes = idx_Bld.permute(0,4,1,2,3) + else: + codes = vae.quantizer.lfq_detail.indices_to_codes(idx_Bld, label_type='bit_label') # [B, d, t, h, w] or [B, d, t, 2h, 2w] + if vae_scale_schedule[si] != vae_scale_schedule[-1]: + codes = F.interpolate(codes, size=vae_scale_schedule[-1], mode=vae.quantizer.z_interplote_up) + summed_codes += codes + if si < len(scale_schedule)-1: + last_stage = F.interpolate(summed_codes, size=vae_scale_schedule[si+1], mode=vae.quantizer.z_interplote_down) # [B, d, t, h, w] or [B, d, t, 2h, 2w] + if self.apply_spatial_patchify: # patchify operation + last_stage = last_stage.permute(0,2,1,3,4) # [B, t, d, 2h, 2w] + last_stage = torch.nn.functional.pixel_unshuffle(last_stage, 2) # [B, t, 4d, h, w] + last_stage = last_stage.permute(0,2,1,3,4) # [B, 4d, t, h, w] + last_stage = last_stage.reshape(*last_stage.shape[:2], -1) # [B, d, t*h*w] or [B, 4d, t*h*w] + last_stage = torch.permute(last_stage, [0,2,1]) # [B, t*h*w, d] or [B, t*h*w, 4d] + last_stage = self.word_embed(self.norm0_ve(last_stage)) + last_stage = last_stage.repeat(bs//B, 1, 1) + for b in self.unregistered_blocks: b.attn.kv_caching(False) + if low_vram_mode: vae.to('cuda') + img = self.summed_codes2images(vae, summed_codes) + return ret, idx_Bl_list, img + + def summed_codes2images(self, vae, summed_codes): + t1 = time.time() + + img = vae.decode(summed_codes, slice=True) + img = (img + 1) / 2 + img = torch.clamp(img, 0, 1) + img = img.permute(0,2,3,4,1) # [bs, 3, t, h, w] -> [bs, t, h, w, 3] + img = img.mul_(255).to(torch.uint8).flip(dims=(4,)) + + # smooth the image & video + img[:, 0:1, :, :, :] = img[:, 1:2, :, :, :] + + print(f'Decode takes {time.time()-t1:.1f}s') + return img + + @for_visualize + def vis_key_params(self, ep): + return + + def load_state_dict(self, state_dict: Dict[str, Any], strict=False, assign=False): + for k in state_dict: + if 'cfg_uncond' in k: + old, new = state_dict[k], self.cfg_uncond.data + min_tlen = min(old.shape[0], new.shape[0]) + if min_tlen == old.shape[0]: + state_dict[k] = torch.cat((old.to(device=new.device, dtype=new.dtype), new[min_tlen:])) + else: + state_dict[k] = old[:min_tlen] + + for buf_name in ('lvl_1L', 'attn_bias_for_masking', 'Infinity_visible_kvlen', 'Infinity_invisible_qlen'): + state_dict.pop(buf_name, None) + if hasattr(self, buf_name): + state_dict[buf_name] = getattr(self, buf_name) + + return super().load_state_dict(state_dict=state_dict, strict=strict, assign=assign) + + def special_init(self): + if self.arch == 'qwen': + std = 0.02 + for module in self.modules(): + if isinstance(module, nn.Linear): + 
module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + else: + raise ValueError(f'Unknown arch {self.arch}') + + def extra_repr(self): + return f'' + + def get_layer_id_and_scale_exp(self, para_name: str): + raise NotImplementedError + + +def sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV: torch.Tensor, top_k: int = 0, top_p: float = 0.0, rng=None, num_samples=1) -> torch.Tensor: # return idx, shaped (B, l) + B, l, V = logits_BlV.shape + if top_k > 0: + top_k = min(top_k, V) + idx_to_remove = logits_BlV < logits_BlV.topk(top_k, largest=True, sorted=False, dim=-1)[0].amin(dim=-1, keepdim=True) + logits_BlV.masked_fill_(idx_to_remove, -torch.inf) + if top_p > 0: + sorted_logits, sorted_idx = logits_BlV.sort(dim=-1, descending=False) + sorted_idx_to_remove = sorted_logits.softmax(dim=-1).cumsum_(dim=-1) <= (1 - top_p) + sorted_idx_to_remove[..., -1:] = False + logits_BlV.masked_fill_(sorted_idx_to_remove.scatter(sorted_idx.ndim - 1, sorted_idx, sorted_idx_to_remove), -torch.inf) + # sample (have to squeeze cuz multinomial can only be used on 2D tensor) + replacement = num_samples >= 0 + num_samples = abs(num_samples) + return torch.multinomial(logits_BlV.softmax(dim=-1).view(-1, V), num_samples=num_samples, replacement=replacement, generator=rng).view(B, l, num_samples) + +def sampling_with_top_k_top_p_also_inplace_modifying_probs_(probs_BlV: torch.Tensor, top_k: int = 0, top_p: float = 0.0, rng=None, num_samples=1) -> torch.Tensor: # return idx, shaped (B, l) + B, l, V = probs_BlV.shape + if top_k > 0: + top_k = min(top_k, V) + idx_to_remove = probs_BlV < probs_BlV.topk(top_k, largest=True, sorted=False, dim=-1)[0].amin(dim=-1, keepdim=True) + probs_BlV.masked_fill_(idx_to_remove, 0) + if top_p > 0: + sorted_probs, sorted_idx = probs_BlV.sort(dim=-1, descending=False) + sorted_idx_to_remove = sorted_probs.softmax(dim=-1).cumsum_(dim=-1) <= (1 - top_p) + sorted_idx_to_remove[..., -1:] = False + probs_BlV.masked_fill_(sorted_idx_to_remove.scatter(sorted_idx.ndim - 1, sorted_idx, sorted_idx_to_remove), 0) + # sample (have to squeeze cuz multinomial can only be used on 2D tensor) + probs_BlV = probs_BlV / probs_BlV.sum(-1, keepdims=True) + replacement = num_samples >= 0 + num_samples = abs(num_samples) + return torch.multinomial(probs_BlV.view(-1, V), num_samples=num_samples, replacement=replacement, generator=rng).view(B, l, num_samples) + + +def get_params_num(d, w, mlp): + m = round(mlp * w / 256) * 256 + s = d * (w**2 * 8 + w*m * 2) # sa+ca, mlp + s += w**2 * 6 # saln + s += 4096 * w # pred + s += 32 * w # we + + Ct5 = 4096 + s += Ct5*w * 4 # T5 attn pool + s += Ct5*w + w*w # T5 mlp + return f'{s/1e9:.2f}B' + + +TIMM_KEYS = {'img_size', 'pretrained', 'pretrained_cfg', 'pretrained_cfg_overlay', 'global_pool'} + +@register_model +def infinity_2b(depth=32, embed_dim=2048, num_heads=2048//128, drop_path_rate=0.1, **kwargs): return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) + +@register_model +def infinity_sa2b(depth=28, block_chunks=7, embed_dim=2560, num_heads=2560//128, drop_path_rate=0.1, **kwargs): return Infinity(depth=depth, block_chunks=block_chunks, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, 
drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) + +@register_model +def infinity_sa8b(depth=42, block_chunks=7, embed_dim=4096, num_heads=4096//128, drop_path_rate=0.1, **kwargs): return Infinity(depth=depth, block_chunks=block_chunks, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) + +@register_model +def infinity_sa14b(depth=40, block_chunks=8, embed_dim=5120, num_heads=5120//128, drop_path_rate=0.1, mlp_ratio=3.4, **kwargs): + return Infinity( + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + # (depth=40, block_chunks=8, embed_dim=5120, num_heads=5120//128, num_key_value_heads=5120//128//4, drop_path_rate=0, **kwargs) + +@register_model +def infinity_sa12b(depth=60, embed_dim=4096, num_heads=4096//128, drop_path_rate=0.1, **kwargs): return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) + +@register_model +def infinity_sa16b(depth=42, embed_dim=4096, num_heads=4096//128, drop_path_rate=0.1, **kwargs): return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) + +@register_model +def infinity_v2b(depth=32, embed_dim=2016, num_heads=2016//126, drop_path_rate=0.1, **kwargs): return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) + +@register_model +def infinity_8b(depth=40, block_chunks=1, embed_dim=3584, num_heads=3584//128, drop_path_rate=0.1, **kwargs): return Infinity(depth=depth, block_chunks=block_chunks, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) + +@register_model +def infinity_qwen7b(depth=36, block_chunks=6, embed_dim=4096, num_heads=4096//128, num_key_value_heads=4096//128//4, mlp_ratio=12288/4096, drop_path_rate=0, **kwargs): + return Infinity( + arch='qwen', + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=mlp_ratio, + drop_path_rate=drop_path_rate, + **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + +@register_model +def infinity_qwen8b(depth=36, block_chunks=6, embed_dim=4096, num_heads=4096//128, num_key_value_heads=4096//128//4, mlp_ratio=4, drop_path_rate=0, **kwargs): + return Infinity( + arch='qwen', + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=mlp_ratio, + drop_path_rate=drop_path_rate, + **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + +@register_model +def infinity_qwen_wide14b(depth=36, block_chunks=6, embed_dim=5632, num_heads=5632//128, num_key_value_heads=5632//128//4, drop_path_rate=0, **kwargs): + return Infinity( + arch='qwen', + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=3.4, + drop_path_rate=drop_path_rate, + **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + +@register_model +def 
infinity_qwen13bMHA(depth=40, block_chunks=8, embed_dim=5120, num_heads=5120//128, num_key_value_heads=5120//128, drop_path_rate=0, **kwargs): + return Infinity( + arch='qwen', + qwen_qkvo_bias=True, + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=3.4, + drop_path_rate=drop_path_rate, + **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + +@register_model +def infinity_qwen2_2b(depth=28, block_chunks=7, embed_dim=2304, num_heads=2304//128, num_key_value_heads=2304//128, drop_path_rate=0, **kwargs): + return Infinity( + arch='qwen', + qwen_qkvo_bias=False, + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=3.55, + drop_path_rate=drop_path_rate, + **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + +@register_model +def infinity_qwen0b(depth=4, block_chunks=2, embed_dim=512, num_heads=512//128, num_key_value_heads=512//128, drop_path_rate=0, **kwargs): + return Infinity( + arch='qwen', + qwen_qkvo_bias=False, + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=3.55, + drop_path_rate=drop_path_rate, + **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + +@register_model +def infinity_qwen2_30b(depth=54, block_chunks=27, embed_dim=6144, num_heads=6144//128, num_key_value_heads=6144//128//4, drop_path_rate=0, **kwargs): + return Infinity( + arch='qwen', + qwen_qkvo_bias=False, + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=4, #mlp_ratio=3.55, + drop_path_rate=drop_path_rate, + **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + +@register_model +def infinity_qwen14b(depth=48, block_chunks=24, embed_dim=4608, num_heads=4608//128, num_key_value_heads=4608//128//4, drop_path_rate=0, **kwargs): + return Infinity( + arch='qwen', + qwen_qkvo_bias=False, + depth=depth, + block_chunks=block_chunks, + embed_dim=embed_dim, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + mlp_ratio=4, + drop_path_rate=drop_path_rate, + **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS} + ) + +@register_model +def infinity_20b(depth=58, embed_dim=4608, num_heads=4608//128, drop_path_rate=0.25, **kwargs): return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) + +# model configuration for scaling Infinity transformer +@register_model +def infinity_layer12(depth=12, embed_dim=768, num_heads=8, drop_path_rate=0.1, **kwargs): + return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) +@register_model +def infinity_layer16(depth=16, embed_dim=1152, num_heads=12, drop_path_rate=0.1, **kwargs): + return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) +@register_model +def infinity_layer24(depth=24, embed_dim=1536, num_heads=16, drop_path_rate=0.1, **kwargs): + return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) 
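A quick sanity check on the registry above: every factory keeps the per-head dimension fixed at 128 (num_heads = embed_dim // 128), and get_params_num estimates the dense parameter count for a given depth, width, and MLP ratio. The sketch below is not part of the patch; it simply re-evaluates the same arithmetic for the infinity_2b configuration (depth 32, width 2048, mlp_ratio 4). The helper name rough_params is invented for the sketch; the formula and the resulting ~2.23B figure follow from the get_params_num code above.

def rough_params(d, w, mlp):
    # Mirrors get_params_num() above, returning the raw count instead of a string.
    m = round(mlp * w / 256) * 256      # MLP hidden width, rounded to a multiple of 256
    s = d * (w ** 2 * 8 + w * m * 2)    # per-layer sa+ca and MLP weights
    s += w ** 2 * 6                     # saln
    s += 4096 * w                       # pred head
    s += 32 * w                         # word embedding
    Ct5 = 4096
    s += Ct5 * w * 4                    # T5 attn pool
    s += Ct5 * w + w * w                # T5 mlp
    return s

print(2048 // 128)                              # 16 heads, head_dim stays 128
print(f"{rough_params(32, 2048, 4) / 1e9:.2f}B")  # ~2.23B, matching the 'infinity_2b' name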
+@register_model +def infinity_layer32(depth=32, embed_dim=2080, num_heads=20, drop_path_rate=0.1, **kwargs): + return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) +@register_model +def infinity_layer40(depth=40, embed_dim=2688, num_heads=24, drop_path_rate=0.1, **kwargs): + return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) +@register_model +def infinity_layer48(depth=48, embed_dim=3360, num_heads=28, drop_path_rate=0.1, **kwargs): + return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS}) diff --git a/Meissonic/InfinityStar/infinity/models/init_param.py b/Meissonic/InfinityStar/infinity/models/init_param.py new file mode 100644 index 0000000000000000000000000000000000000000..4221d734ae53463edade10174d2917c903fee0b9 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/init_param.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import torch.nn as nn + + +def init_weights(model: nn.Module, conv_std_or_gain: float = 0.02, other_std: float = 0.02): + """ + :param model: the model to be inited + :param conv_std_or_gain: how to init every conv layer `m` + > 0: nn.init.trunc_normal_(m.weight.data, std=conv_std_or_gain) + < 0: nn.init.xavier_normal_(m.weight.data, gain=-conv_std_or_gain) + :param other_std: how to init every linear layer or embedding layer + use nn.init.trunc_normal_(m.weight.data, std=other_std) + """ + skip = abs(conv_std_or_gain) > 10 + if skip: return + print(f'[init_weights] {type(model).__name__} with {"std" if conv_std_or_gain > 0 else "gain"}={abs(conv_std_or_gain):g}') + for m in model.modules(): + if isinstance(m, nn.Linear): + nn.init.trunc_normal_(m.weight.data, std=other_std) + if m.bias is not None: + nn.init.constant_(m.bias.data, 0.) + elif isinstance(m, nn.Embedding): + nn.init.trunc_normal_(m.weight.data, std=other_std) + if m.padding_idx is not None: + m.weight.data[m.padding_idx].zero_() + elif isinstance(m, (nn.Conv1d, nn.Conv2d, nn.ConvTranspose1d, nn.ConvTranspose2d)): + nn.init.trunc_normal_(m.weight.data, std=conv_std_or_gain) if conv_std_or_gain > 0 else nn.init.xavier_normal_(m.weight.data, gain=-conv_std_or_gain) # todo: StyleSwin: (..., gain=.02) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias.data, 0.) + elif isinstance(m, (nn.LayerNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm, nn.GroupNorm, nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d)): + if m.bias is not None: + nn.init.constant_(m.bias.data, 0.) + if m.weight is not None: + nn.init.constant_(m.weight.data, 1.) 
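A minimal usage sketch for the init_weights helper added in init_param.py above; it is editorial and not part of the patch. The toy module and the numeric arguments are illustrative, while the call signature, the truncated-normal/Xavier switch on the sign of conv_std_or_gain, and the skip rule (|conv_std_or_gain| > 10 disables initialization) come from the code itself. The import path assumes the Meissonic/InfinityStar repo root is on PYTHONPATH, as the other modules in this diff do.

import torch.nn as nn
from infinity.models.init_param import init_weights

# A throwaway container just to exercise each branch of init_weights.
toy = nn.Sequential(
    nn.Embedding(32, 64),
    nn.Linear(64, 64),
    nn.Conv2d(64, 64, 3, padding=1),
    nn.LayerNorm(64),
)

# Truncated-normal init (std=0.02) for Conv/Linear/Embedding, constant init for the norm.
init_weights(toy, conv_std_or_gain=0.02, other_std=0.02)

# A negative value switches conv layers to Xavier-normal with gain = 0.3.
init_weights(toy, conv_std_or_gain=-0.3)

# Magnitudes above 10 are treated as "skip": weights are left untouched.
init_weights(toy, conv_std_or_gain=999)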
diff --git a/Meissonic/InfinityStar/infinity/models/rope.py b/Meissonic/InfinityStar/infinity/models/rope.py new file mode 100644 index 0000000000000000000000000000000000000000..4e73bfdf92286c5293f1eb2a56ea7fa927f05da2 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/rope.py @@ -0,0 +1,248 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import math +import os +from functools import partial +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from timm.models.layers import DropPath, drop_path +from torch.utils.checkpoint import checkpoint +from infinity.schedules.dynamic_resolution import get_first_full_spatial_size_scale_index + + +def precompute_rope2d_freqs_grid(dim, dynamic_resolution_h_w, rope2d_normalized_by_hw, pad_to_multiplier=1, max_height=2048 // 16, max_width=2048 // 16, base=10000.0, device=None, scaling_factor=1.0, activated_h_div_w_templates=[]): + # split the dimension into half, one for x and one for y + half_dim = dim // 2 + inv_freq = 1.0 / (base ** (torch.arange(0, half_dim, 2, dtype=torch.int64).float().to(device) / half_dim)) # namely theta, 1 / (10000^(i/half_dim)), i=0,2,..., half_dim-2 + t_height = torch.arange(max_height, device=device, dtype=torch.int64).type_as(inv_freq) + t_width = torch.arange(max_width, device=device, dtype=torch.int64).type_as(inv_freq) + t_height = t_height / scaling_factor + freqs_height = torch.outer(t_height, inv_freq) # (max_height, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2), namely y*theta + t_width = t_width / scaling_factor + freqs_width = torch.outer(t_width, inv_freq) # (max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2), namely x*theta + freqs_grid_map = torch.concat([ + freqs_height[:, None, :].expand(-1, max_width, -1), # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2) + freqs_width[None, :, :].expand(max_height, -1, -1), # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2) + ], dim=-1) # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d)) + freqs_grid_map = torch.stack([torch.cos(freqs_grid_map), torch.sin(freqs_grid_map)], dim=0) + # (2, max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d)) + + rope2d_freqs_grid = {} + for h_div_w in activated_h_div_w_templates: + assert h_div_w in dynamic_resolution_h_w, f'Unknown h_div_w: {h_div_w}' + scale_schedule = dynamic_resolution_h_w[h_div_w]['1M']['image_scales'] + _, ph, pw = scale_schedule[-1] + max_edge_length = freqs_grid_map.shape[1] + if ph >= pw: + uph, upw = max_edge_length, int(max_edge_length / ph * pw) + else: + uph, upw = int(max_edge_length / pw * ph), max_edge_length + rope_cache_list = [] + for (_, ph, pw) in scale_schedule: + ph_mul_pw = ph * pw + if rope2d_normalized_by_hw == 1: # downsample + rope_cache = F.interpolate(freqs_grid_map[:, :uph, :upw, :].permute([0,3,1,2]), size=(ph, pw), mode='bilinear', align_corners=True) + rope_cache = rope_cache.permute([0,2,3,1]) # (2, ph, pw, half_head_dim) + elif rope2d_normalized_by_hw == 2: # star stylee + _, uph, upw = scale_schedule[-1] + indices = torch.stack([ + (torch.arange(ph) * (uph / ph)).reshape(ph, 1).expand(ph, pw), + (torch.arange(pw) * (upw / pw)).reshape(1, pw).expand(ph, pw), + ], dim=-1).round().int() # (ph, pw, 2) + indices = indices.reshape(-1, 2) # (ph*pw, 2) + rope_cache = freqs_grid_map[:, indices[:,0], indices[:,1], :] # (2, ph*pw, half_head_dim) + rope_cache = rope_cache.reshape(2, ph, pw, -1) + elif rope2d_normalized_by_hw == 0: + 
rope_cache = freqs_grid_map[:, :ph, :pw, :] # (2, ph, pw, half_head_dim) + else: + raise ValueError(f'Unknown rope2d_normalized_by_hw: {rope2d_normalized_by_hw}') + rope_cache_list.append(rope_cache.reshape(2, ph_mul_pw, -1)) + cat_rope_cache = torch.cat(rope_cache_list, 1) # (2, seq_len, half_head_dim) + if cat_rope_cache.shape[1] % pad_to_multiplier: + pad = torch.zeros(2, pad_to_multiplier - cat_rope_cache.shape[1] % pad_to_multiplier, half_dim) + cat_rope_cache = torch.cat([cat_rope_cache, pad], dim=1) + cat_rope_cache = cat_rope_cache[:,None,None,None] # (2, 1, 1, 1, seq_len, half_dim) + for pn in dynamic_resolution_h_w[h_div_w]: + scale_schedule = dynamic_resolution_h_w[h_div_w][pn]['image_scales'] + tmp_scale_schedule = [(1, h, w) for _, h, w in scale_schedule] + rope2d_freqs_grid[str(tuple(tmp_scale_schedule))] = cat_rope_cache + return rope2d_freqs_grid + + +def precompute_rope3d_freqs_grid(dim, dynamic_resolution_h_w, rope2d_normalized_by_hw, pad_to_multiplier=1, max_frames=128, max_height=2048 // 8, max_width=2048 // 8, base=10000.0, device=None, activated_h_div_w_templates=[], steps_per_frame=4, pn=None, args=None): + # split the dimension into three parts, one for x, one for y, and one for t + assert dim % 2 == 0, f'Only support dim % 2 == 0, but got dim={dim}' + dim_div_2 = dim // 2 + num_of_freqs = int(np.ceil(dim_div_2 / 3)) + inv_freq = 1.0 / (base ** (torch.arange(num_of_freqs, dtype=torch.int64).float().to(device) / num_of_freqs)) # namely theta, 1 / (10000^(i/dim_div_3)), i=0,2,..., dim_div_3-2, totally dim_div_3 / 2 elems + t_height = torch.arange(max_height, device=device, dtype=torch.int64).type_as(inv_freq) + t_width = torch.arange(max_width, device=device, dtype=torch.int64).type_as(inv_freq) + t_frames = torch.arange(max_frames, device=device, dtype=torch.int64).type_as(inv_freq) + freqs_height = torch.outer(t_height, inv_freq) # (max_height, ceil(dim_div_2 / 3)), namely y*theta + freqs_width = torch.outer(t_width, inv_freq) # (max_width, ceil(dim_div_2 / 3)), namely x*theta + freqs_frames = torch.outer(t_frames, inv_freq) # (max_width, ceil(dim_div_2 / 3)), namely x*theta + if (num_of_freqs*3) - dim_div_2 == 0: + offset_t, offset_h, offset_w = num_of_freqs, num_of_freqs, num_of_freqs + elif (num_of_freqs*3) - dim_div_2 == 2: # 2 elems that should be drop + offset_t, offset_h, offset_w = num_of_freqs, num_of_freqs-1, num_of_freqs-1 + else: # 1 elems that should be drop + offset_t, offset_h, offset_w = num_of_freqs-1, num_of_freqs, num_of_freqs + freqs_grid_map = torch.concat([ + freqs_frames[:, None, None, :offset_t].expand(-1, max_height, max_width, -1), # (max_frames, max_height, max_width, ceil(dim_div_2 / 3)) + freqs_height[None, :, None, :offset_h].expand(max_frames, -1, max_width, -1), # (max_frames, max_height, max_width, ceil(dim_div_2 / 3)) + freqs_width[None, None, :, :offset_w].expand(max_frames, max_height, -1, -1), # (max_frames, max_height, max_width, ceil(dim_div_2 / 3)) + ], dim=-1) # (max_frames, max_height, max_width, dim / 2) + freqs_grid_map = torch.stack([torch.cos(freqs_grid_map), torch.sin(freqs_grid_map)], dim=0) + # (2, max_frames, max_height, max_width, dim / 2) + + rope2d_freqs_grid = {} + for h_div_w in activated_h_div_w_templates: + assert h_div_w in dynamic_resolution_h_w, f'Unknown h_div_w: {h_div_w}' + image_scale_schedule = dynamic_resolution_h_w[h_div_w][pn]['image_scales'] + video_scale_schedule = dynamic_resolution_h_w[h_div_w][pn]['video_scales'] + first_full_spatial_size_scale_index = 
get_first_full_spatial_size_scale_index(video_scale_schedule) + pt, ph, pw = video_scale_schedule[-1] + rope_cache_list4image, rope_cache_list4video = [], [] + + # image + for si, (pt, ph, pw) in enumerate(image_scale_schedule): + assert pt == 1 + mul_pt_ph_pw = pt * ph * pw + mul_ph_pw = ph * pw + if rope2d_normalized_by_hw == 2: # star style + upt, uph, upw = image_scale_schedule[-1] + t_inds = 0 * torch.ones(pt, ph, pw) + indices = torch.stack([ + t_inds, + (torch.arange(ph) * (uph / ph)).reshape(1, ph, 1).expand(pt, ph, pw), + (torch.arange(pw) * (upw / pw)).reshape(1, 1, pw).expand(pt, ph, pw), + ], dim=-1).round().int() # (pt, ph, pw, 3) + indices = indices.reshape(-1, 3) # (pt*ph*pw, 3) + rope_cache = freqs_grid_map[:, indices[:,0], indices[:,1], indices[:,2], :] # (2, pt*ph*pw, dim / 2) + rope_cache = rope_cache.reshape(2, pt, ph, pw, -1) + elif rope2d_normalized_by_hw == 0: + rope_cache = freqs_grid_map[:, :pt, :ph, :pw, :] # (2, pt, ph, pw, dim / 2) + else: + raise ValueError(f'Unknown rope2d_normalized_by_hw: {rope2d_normalized_by_hw}') + rope_cache_list4image.append(rope_cache.reshape(2, mul_ph_pw, -1)) # (2, 1*ph*pw, dim / 2) + + # video + for si, (pt, ph, pw) in enumerate(video_scale_schedule): + mul_pt_ph_pw = pt * ph * pw + mul_ph_pw = ph * pw + if rope2d_normalized_by_hw == 2: # star style + upt, uph, upw = video_scale_schedule[-1] + if args.dynamic_scale_schedule == 'infinity_video_tower': + t_ind = int(np.ceil((si - first_full_spatial_size_scale_index) / steps_per_frame)) + t_ind = max(t_ind, 0) + t_inds = t_ind * torch.ones(pt, ph, pw) + print(f't_ind: {t_ind}, si: {si}, (pt, ph, pw): {(pt, ph, pw)}') + else: + t_inds = (torch.arange(pt)).reshape(pt, 1, 1).expand(pt, ph, pw) + indices = torch.stack([ + t_inds, + (torch.arange(ph) * (uph / ph)).reshape(1, ph, 1).expand(pt, ph, pw), + (torch.arange(pw) * (upw / pw)).reshape(1, 1, pw).expand(pt, ph, pw), + ], dim=-1).round().int() # (pt, ph, pw, 3) + indices = indices.reshape(-1, 3) # (pt*ph*pw, 3) + rope_cache = freqs_grid_map[:, indices[:,0], indices[:,1], indices[:,2], :] # (2, pt*ph*pw, dim / 2) + rope_cache = rope_cache.reshape(2, pt, ph, pw, -1) + elif rope2d_normalized_by_hw == 0: + rope_cache = freqs_grid_map[:, :pt, :ph, :pw, :] # (2, pt, ph, pw, dim / 2) + else: + raise ValueError(f'Unknown rope2d_normalized_by_hw: {rope2d_normalized_by_hw}') + rope_cache_list4video.append(rope_cache.reshape(2, mul_pt_ph_pw, -1)) # (2, pt*ph*pw, dim / 2) + cat_rope_cache4image = torch.cat(rope_cache_list4image, 1) # (2, seq_len, dim / 2) + cat_rope_cache4video = torch.cat(rope_cache_list4video, 1) # (2, seq_len, dim / 2) + if cat_rope_cache4image.shape[1] % pad_to_multiplier: + pad = torch.zeros(2, pad_to_multiplier - cat_rope_cache4image.shape[1] % pad_to_multiplier, dim//2) + cat_rope_cache4image = torch.cat([cat_rope_cache4image, pad], dim=1) + if cat_rope_cache4video.shape[1] % pad_to_multiplier: + pad = torch.zeros(2, pad_to_multiplier - cat_rope_cache4video.shape[1] % pad_to_multiplier, dim//2) + cat_rope_cache4video = torch.cat([cat_rope_cache4video, pad], dim=1) + cat_rope_cache4image = cat_rope_cache4image[:,None,None,None] # (2, 1, 1, 1, seq_len, dim / 2) + cat_rope_cache4video = cat_rope_cache4video[:,None,None,None] # (2, 1, 1, 1, seq_len, dim / 2) + rope2d_freqs_grid[str(tuple(image_scale_schedule))] = cat_rope_cache4image + rope2d_freqs_grid[str(tuple(video_scale_schedule))] = cat_rope_cache4video + return rope2d_freqs_grid + + +def precompute_rope4d_freqs_grid( + dim, + rope2d_normalized_by_hw, + 
pad_to_multiplier=1, + max_scales=128, + max_frames=128, + max_height=2048 // 8, + max_width=2048 // 8, + base=10000.0, + device=None, + activated_h_div_w_templates=[], + steps_per_frame=4, + text_maxlen=0, + pn=None, + args=None, + **kwargs, +): + # split the dimension into three parts, one for x, one for y, and one for t + print(f'[precompute_rope4d_freqs_grid: 4d]: start') + assert dim % 2 == 0, f'Only support dim % 2 == 0, but got dim={dim}' + dim_div_2 = dim // 2 + num_of_freqs = int(np.ceil(dim_div_2 / 4)) + inv_freq = 1.0 / (base ** (torch.arange(num_of_freqs, dtype=torch.int64).float().to(device) / num_of_freqs)) # namely theta, 1 / (10000^(i/dim_div_4)), i=0,2,..., dim_div_4-2, totally dim_div_4 / 2 elems + t_scales = torch.arange(text_maxlen+max_scales, device=device, dtype=torch.int64).type_as(inv_freq) + t_frames = torch.arange(max_frames, device=device, dtype=torch.int64).type_as(inv_freq) + t_height = torch.arange(max_height, device=device, dtype=torch.int64).type_as(inv_freq) + t_width = torch.arange(max_width, device=device, dtype=torch.int64).type_as(inv_freq) + freqs_scales = torch.outer(t_scales, inv_freq) # (text_maxlen+max_scales, ceil(dim_div_2 / 4)), namely x*theta + freqs_frames = torch.outer(t_frames, inv_freq) # (max_frames, ceil(dim_div_2 / 4)), namely x*theta + freqs_height = torch.outer(t_height, inv_freq) # (max_height, ceil(dim_div_2 / 4)), namely y*theta + freqs_width = torch.outer(t_width, inv_freq) # (max_width, ceil(dim_div_2 / 4)), namely x*theta + assert num_of_freqs*4==dim_div_2 + freqs_scales = torch.stack([torch.cos(freqs_scales), torch.sin(freqs_scales)], dim=0) + freqs_frames = torch.stack([torch.cos(freqs_frames), torch.sin(freqs_frames)], dim=0) + freqs_height = torch.stack([torch.cos(freqs_height), torch.sin(freqs_height)], dim=0) + freqs_width = torch.stack([torch.cos(freqs_width), torch.sin(freqs_width)], dim=0) + tm = text_maxlen + rope_text_embeds = torch.cat([ + freqs_scales[ :, :tm, None, None, None, :].expand(-1, -1, -1, -1, -1, -1), + freqs_frames[ :, None, :1, None, None, :].expand(-1, tm, -1, -1, -1, -1), + freqs_height[ :, None, None, :1, None, :].expand(-1, tm, -1, -1, -1, -1), + freqs_width[ :, None, None, None, :1, :].expand(-1, tm, -1, -1, -1, -1), + ], dim=-1) # (2, tm, 1, 1, 1, dim_div_2) + rope_text_embeds = rope_text_embeds.reshape(2, 1, 1, 1, tm, dim_div_2) + rope2d_freqs_grid = {} + rope2d_freqs_grid['freqs_text'] = rope_text_embeds # (2, 1, 1, 1, text_maxlen, dim / 2) + rope2d_freqs_grid['freqs_scales'] = freqs_scales[:, tm:] # (2, max_scales, ceil(dim_div_2 / 4)) + rope2d_freqs_grid['freqs_frames'] = freqs_frames # (2, max_frames, ceil(dim_div_2 / 4)) + rope2d_freqs_grid['freqs_height'] = freqs_height # (2, max_height, ceil(dim_div_2 / 4)) + rope2d_freqs_grid['freqs_width'] = freqs_width # (2, max_width, ceil(dim_div_2 / 4)) + return rope2d_freqs_grid + +def apply_rotary_emb(q, k, rope_cache): + device_type = q.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + qk = [q, k] + rope_cache = rope_cache[:,0] + with torch.autocast(device_type=device_type, enabled=False): + for i in range(2): + qk[i] = qk[i].reshape(*qk[i].shape[:-1], -1, 2) + tmp1 = qk[i][..., 1] * rope_cache[1] + tmp2 = qk[i][..., 0] * rope_cache[1] + qk[i][..., 0].mul_(rope_cache[0]).sub_(tmp1) + qk[i][..., 1].mul_(rope_cache[0]).add_(tmp2) + qk[i] = qk[i].reshape(*qk[i].shape[:-2], -1) + q, k = qk + # qk = qk.reshape(*qk.shape[:-1], -1, 2) #(2, batch_size, heads, seq_len, half_head_dim, 2) + # qk = 
torch.stack([ + # qk[...,0] * rope_cache[0] - qk[...,1] * rope_cache[1], + # qk[...,0] * rope_cache[1] + qk[...,1] * rope_cache[0], + # ], dim=-1) # (2, batch_size, heads, seq_len, half_head_dim, 2), here stack + reshape should not be concate + # qk = qk.reshape(*qk.shape[:-2], -1) #(2, batch_size, heads, seq_len, head_dim) + # q, k = qk.unbind(dim=0) # (batch_size, heads, seq_len, head_dim) + return q, k \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/rope_inplace.py b/Meissonic/InfinityStar/infinity/models/rope_inplace.py new file mode 100644 index 0000000000000000000000000000000000000000..ab327bc5d0e6ea1c4153129d247dbacd7b03fdc4 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/rope_inplace.py @@ -0,0 +1,179 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import math +import os +from functools import partial +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from timm.models.layers import DropPath, drop_path +from torch.utils.checkpoint import checkpoint + + +def precompute_rope2d_freqs_grid(dim, dynamic_resolution_h_w, rope2d_normalized_by_hw, pad_to_multiplier=1, max_height=2048 // 16, max_width=2048 // 16, base=10000.0, device=None, scaling_factor=1.0): + # split the dimension into half, one for x and one for y + half_dim = dim // 2 + inv_freq = 1.0 / (base ** (torch.arange(0, half_dim, 2, dtype=torch.int64).float().to(device) / half_dim)) # namely theta, 1 / (10000^(i/half_dim)), i=0,2,..., half_dim-2 + t_height = torch.arange(max_height, device=device, dtype=torch.int64).type_as(inv_freq) + t_width = torch.arange(max_width, device=device, dtype=torch.int64).type_as(inv_freq) + t_height = t_height / scaling_factor + freqs_height = torch.outer(t_height, inv_freq) # (max_height, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2), namely y*theta + t_width = t_width / scaling_factor + freqs_width = torch.outer(t_width, inv_freq) # (max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2), namely x*theta + freqs_grid_map = torch.concat([ + freqs_height[:, None, :].expand(-1, max_width, -1), # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2) + freqs_width[None, :, :].expand(max_height, -1, -1), # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2) + ], dim=-1) # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d)) + freqs_grid_map = torch.stack([torch.cos(freqs_grid_map), torch.sin(freqs_grid_map)], dim=0) + # (2, max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d)) + + rope2d_freqs_grid = {} + for h_div_w in dynamic_resolution_h_w: + scale_schedule = dynamic_resolution_h_w[h_div_w]['1M']['scales'] + _, ph, pw = scale_schedule[-1] + max_edge_length = freqs_grid_map.shape[1] + if ph >= pw: + uph, upw = max_edge_length, int(max_edge_length / ph * pw) + else: + uph, upw = int(max_edge_length / pw * ph), max_edge_length + rope_cache_list = [] + for (_, ph, pw) in scale_schedule: + ph_mul_pw = ph * pw + if rope2d_normalized_by_hw == 1: # downsample + rope_cache = F.interpolate(freqs_grid_map[:, :uph, :upw, :].permute([0,3,1,2]), size=(ph, pw), mode='bilinear', align_corners=True) + rope_cache = rope_cache.permute([0,2,3,1]) # (2, ph, pw, half_head_dim) + elif rope2d_normalized_by_hw == 2: # star stylee + _, uph, upw = scale_schedule[-1] + indices = torch.stack([ + (torch.arange(ph) * (uph / ph)).reshape(ph, 1).expand(ph, pw), + (torch.arange(pw) * (upw / pw)).reshape(1, pw).expand(ph, pw), + ], 
dim=-1).round().int() # (ph, pw, 2) + indices = indices.reshape(-1, 2) # (ph*pw, 2) + rope_cache = freqs_grid_map[:, indices[:,0], indices[:,1], :] # (2, ph*pw, half_head_dim) + rope_cache = rope_cache.reshape(2, ph, pw, -1) + elif rope2d_normalized_by_hw == 0: + rope_cache = freqs_grid_map[:, :ph, :pw, :] # (2, ph, pw, half_head_dim) + else: + raise ValueError(f'Unknown rope2d_normalized_by_hw: {rope2d_normalized_by_hw}') + rope_cache_list.append(rope_cache.reshape(2, ph_mul_pw, -1)) + cat_rope_cache = torch.cat(rope_cache_list, 1) # (2, seq_len, half_head_dim) + if cat_rope_cache.shape[1] % pad_to_multiplier: + pad = torch.zeros(2, pad_to_multiplier - cat_rope_cache.shape[1] % pad_to_multiplier, half_dim) + cat_rope_cache = torch.cat([cat_rope_cache, pad], dim=1) + cat_rope_cache = cat_rope_cache[:,None,None,None] # (2, 1, 1, 1, seq_len, half_dim) + for pn in dynamic_resolution_h_w[h_div_w]: + scale_schedule = dynamic_resolution_h_w[h_div_w][pn]['scales'] + tmp_scale_schedule = [(1, h, w) for _, h, w in scale_schedule] + rope2d_freqs_grid[str(tuple(tmp_scale_schedule))] = cat_rope_cache + return rope2d_freqs_grid + + +def precompute_rope3d_freqs_grid(dim, dynamic_resolution_h_w, rope2d_normalized_by_hw, pad_to_multiplier=1, max_frames=129 // 4, max_height=2048 // 8, max_width=2048 // 8, base=10000.0, device=None): + # split the dimension into three parts, one for x, one for y, and one for t + assert dim % 2 == 0, f'Only support dim % 2 == 0, but got dim={dim}' + dim_div_2 = dim // 2 + num_of_freqs = int(np.ceil(dim_div_2 / 3)) + inv_freq = 1.0 / (base ** (torch.arange(num_of_freqs, dtype=torch.int64).float().to(device) / num_of_freqs)) # namely theta, 1 / (10000^(i/dim_div_3)), i=0,2,..., dim_div_3-2, totally dim_div_3 / 2 elems + t_height = torch.arange(max_height, device=device, dtype=torch.int64).type_as(inv_freq) + t_width = torch.arange(max_width, device=device, dtype=torch.int64).type_as(inv_freq) + t_frames = torch.arange(max_frames, device=device, dtype=torch.int64).type_as(inv_freq) + freqs_height = torch.outer(t_height, inv_freq) # (max_height, ceil(dim_div_2 / 3)), namely y*theta + freqs_width = torch.outer(t_width, inv_freq) # (max_width, ceil(dim_div_2 / 3)), namely x*theta + freqs_frames = torch.outer(t_frames, inv_freq) # (max_width, ceil(dim_div_2 / 3)), namely x*theta + if (num_of_freqs*3) - dim_div_2 == 0: + offset_t, offset_h, offset_w = num_of_freqs, num_of_freqs, num_of_freqs + elif (num_of_freqs*3) - dim_div_2 == 2: # 2 elems that should be drop + offset_t, offset_h, offset_w = num_of_freqs, num_of_freqs-1, num_of_freqs-1 + else: # 1 elems that should be drop + offset_t, offset_h, offset_w = num_of_freqs-1, num_of_freqs, num_of_freqs + freqs_grid_map = torch.concat([ + freqs_frames[:, None, None, :offset_t].expand(-1, max_height, max_width, -1), # (max_frames, max_height, max_width, ceil(dim_div_2 / 3)) + freqs_height[None, :, None, :offset_h].expand(max_frames, -1, max_width, -1), # (max_frames, max_height, max_width, ceil(dim_div_2 / 3)) + freqs_width[None, None, :, :offset_w].expand(max_frames, max_height, -1, -1), # (max_frames, max_height, max_width, ceil(dim_div_2 / 3)) + ], dim=-1) # (max_frames, max_height, max_width, dim / 2) + freqs_grid_map = torch.stack([torch.cos(freqs_grid_map), torch.sin(freqs_grid_map)], dim=0) + # (2, max_frames, max_height, max_width, dim / 2) + + rope2d_freqs_grid = {} + for h_div_w in dynamic_resolution_h_w: + scale_schedule = dynamic_resolution_h_w[h_div_w]['1M']['scales'] + pt, ph, pw = scale_schedule[-1] + 
rope_cache_list4image, rope_cache_list4video = [], [] + for (pt, ph, pw) in scale_schedule: + mul_pt_ph_pw = pt * ph * pw + mul_ph_pw = ph * pw + if rope2d_normalized_by_hw == 2: # star stylee + upt, uph, upw = scale_schedule[-1] + indices = torch.stack([ + (torch.arange(pt) * (upt / pt)).reshape(pt, 1, 1).expand(pt, ph, pw), + (torch.arange(ph) * (uph / ph)).reshape(1, ph, 1).expand(pt, ph, pw), + (torch.arange(pw) * (upw / pw)).reshape(1, 1, pw).expand(pt, ph, pw), + ], dim=-1).round().int() # (pt, ph, pw, 3) + indices = indices.reshape(-1, 3) # (pt*ph*pw, 3) + rope_cache = freqs_grid_map[:, indices[:,0], indices[:,1], indices[:,2], :] # (2, pt*ph*pw, dim / 2) + rope_cache = rope_cache.reshape(2, pt, ph, pw, -1) + elif rope2d_normalized_by_hw == 0: + rope_cache = freqs_grid_map[:, :pt, :ph, :pw, :] # (2, pt, ph, pw, dim / 2) + else: + raise ValueError(f'Unknown rope2d_normalized_by_hw: {rope2d_normalized_by_hw}') + rope_cache_list4image.append(rope_cache[:,:1].reshape(2, mul_ph_pw, -1)) # (2, 1*ph*pw, dim / 2) + rope_cache_list4video.append(rope_cache.reshape(2, mul_pt_ph_pw, -1)) # (2, pt*ph*pw, dim / 2) + cat_rope_cache4image = torch.cat(rope_cache_list4image, 1) # (2, seq_len, dim / 2) + cat_rope_cache4video = torch.cat(rope_cache_list4video, 1) # (2, seq_len, dim / 2) + if cat_rope_cache4image.shape[1] % pad_to_multiplier: + pad = torch.zeros(2, pad_to_multiplier - cat_rope_cache4image.shape[1] % pad_to_multiplier, dim//2) + cat_rope_cache4image = torch.cat([cat_rope_cache4image, pad], dim=1) + if cat_rope_cache4video.shape[1] % pad_to_multiplier: + pad = torch.zeros(2, pad_to_multiplier - cat_rope_cache4video.shape[1] % pad_to_multiplier, dim//2) + cat_rope_cache4video = torch.cat([cat_rope_cache4video, pad], dim=1) + cat_rope_cache4image = cat_rope_cache4image[:,None,None,None] # (2, 1, 1, 1, seq_len, dim / 2) + cat_rope_cache4video = cat_rope_cache4video[:,None,None,None] # (2, 1, 1, 1, seq_len, dim / 2) + for pn in dynamic_resolution_h_w[h_div_w]: + video_scale_schedule = dynamic_resolution_h_w[h_div_w][pn]['scales'] + image_scale_schedule = [(1, ph, pw) for pt, ph, pw in video_scale_schedule] + rope2d_freqs_grid[str(tuple(video_scale_schedule))] = cat_rope_cache4video + rope2d_freqs_grid[str(tuple(image_scale_schedule))] = cat_rope_cache4image + return rope2d_freqs_grid + + +def apply_rotary_emb(q, k, scale_schedule, rope2d_freqs_grid, scale_ind): + # qk = torch.stack((q, k), dim=0) #(2, batch_size, heads, seq_len, head_dim) + device_type = q.device.type # (batch_size, heads, seq_len, head_dim)) + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + seq_len = q.shape[2] + start = 0 + if scale_ind >= 1: + assert len(scale_schedule[0]) == 3 + start = np.sum([item[0] * item[1] * item[2] for item in scale_schedule[:scale_ind]]) + try: + rope2d_freqs_grid[str(tuple(scale_schedule))] = rope2d_freqs_grid[str(tuple(scale_schedule))].to(q.device) + except: + print(rope2d_freqs_grid.keys()) + print(str(tuple(scale_schedule))) + import pdb; pdb.set_trace() + assert start+seq_len <= rope2d_freqs_grid[str(tuple(scale_schedule))].shape[4] + rope_cache = rope2d_freqs_grid[str(tuple(scale_schedule))][:, 0, :, :, start:start+seq_len] # rope_cache shape: [2, 1, 1, seq_len, dim] + + qk = [q, k] + for i in range(len(qk)): + qk[i] = qk[i].reshape(*qk[i].shape[:-1], -1, 2) #(batch_size, heads, seq_len, half_head_dim, 2) + # qk = torch.stack([ + # rope_cache[0] * qk[...,0] - rope_cache[1] * 
qk[...,1], + # rope_cache[1] * qk[...,0] + rope_cache[0] * qk[...,1], + # ], dim=-1) # (2, batch_size, heads, seq_len, half_head_dim, 2), here stack + reshape should not be concate + + # this is the in-place version to save vRAM + tmp1 = qk[i][..., 1] * rope_cache[1] + tmp2 = qk[i][..., 0] * rope_cache[1] + qk[i][..., 0].mul_(rope_cache[0]).sub_(tmp1) + qk[i][..., 1].mul_(rope_cache[0]).add_(tmp2) + + q, k = qk + q = q.reshape(*q.shape[:-2], -1) + k = k.reshape(*k.shape[:-2], -1) + return q, k \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/self_correction.py b/Meissonic/InfinityStar/infinity/models/self_correction.py new file mode 100644 index 0000000000000000000000000000000000000000..6634eafe5444bae896265ec6ffde104c187d6b5b --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/self_correction.py @@ -0,0 +1,74 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import os +import os.path as osp + +import cv2 +import torch +import torch.nn.functional as F +import numpy as np + +from infinity.schedules.dynamic_resolution import get_first_full_spatial_size_scale_index + + +def labels2image(all_indices, label_type='int_label', scale_schedule=None): + summed_codes, recons_imgs = self.vae.decode_from_indices(all_indices, scale_schedule, label_type) + recons_img = recons_imgs[0] + recons_img = (recons_img + 1) / 2 + recons_img = recons_img.permute(1, 2, 0).mul_(255).cpu().numpy().astype(np.uint8)[:,:,::-1] + return recons_img + +def features2image(raw_features): + recons_imgs = self.vae.decode(raw_features.squeeze(-3)) + recons_img = recons_imgs[0] + recons_img = (recons_img + 1) / 2 + recons_img = recons_img.permute(1, 2, 0).mul_(255).cpu().numpy().astype(np.uint8)[:,:,::-1] + return recons_img + +class SelfCorrection(object): + def __init__(self, vae, args): + self.noise_apply_layers = args.noise_apply_layers + self.noise_apply_requant = args.noise_apply_requant + self.noise_apply_strength = args.noise_apply_strength + if not isinstance(self.noise_apply_strength, list): + self.noise_apply_strength = str(self.noise_apply_strength) + self.noise_apply_strength = list(map(float, self.noise_apply_strength.split(','))) + if len(self.noise_apply_strength) == 1: + self.noise_apply_strength = self.noise_apply_strength[0] + self.apply_spatial_patchify = args.apply_spatial_patchify + self.vae = vae + print(f'self.noise_apply_strength: {self.noise_apply_strength}') + + def apply_noise_requant(self, bit_indices, quantized, args, device, si, lfq=None, noise_apply_strength=None): + if lfq is None: + lfq = self.vae.quantizer.lfq + if noise_apply_strength is None: + noise_apply_strength = self.noise_apply_strength + if isinstance(noise_apply_strength, list): + noise_apply_strength = np.random.randint(0, max(1, 100 * noise_apply_strength[si]+1)) * 0.01 + else: + noise_apply_strength = np.random.randint(0, max(1, 100 * noise_apply_strength+1)) * 0.01 + mask = torch.rand(*bit_indices.shape, device=device) < noise_apply_strength + pred_bit_indices = bit_indices.clone() + if args.num_of_label_value == 2: + pred_bit_indices[mask] = 1 - pred_bit_indices[mask] + else: + noise = torch.randint(0, args.num_of_label_value, bit_indices.shape, dtype=bit_indices.dtype, device=device) + pred_bit_indices[mask] = noise[mask] + if self.noise_apply_requant: + quantized = lfq.indices_to_codes(pred_bit_indices, label_type = 'bit_label') + return pred_bit_indices, quantized + + def visualize(self, vae_scale_schedule, inp_B3HW, gt_all_bit_indices, pred_all_bit_indices): + 
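# Debug helper: writes a side-by-side strip of (ground-truth frame, reconstruction decoded
# from the ground-truth bit indices, reconstruction decoded from the noised/predicted bit
# indices) to non_teacher_force.jpg and then drops into pdb. Note that labels2image and
# features2image are defined at module level above but still reference self.vae, so a bound
# vae has to be available in their scope before they can actually run.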
gt_img = (inp_B3HW.squeeze(-3) + 1) / 2 * 255 + gt_img = gt_img[0].permute(1,2,0).cpu().numpy().astype(np.uint8)[:,:,::-1] + recons_img_2 = labels2image(gt_all_bit_indices, label_type='bit_label', scale_schedule=vae_scale_schedule) + recons_img_3 = labels2image(pred_all_bit_indices, label_type='bit_label', scale_schedule=vae_scale_schedule) + cat_image = np.concatenate([gt_img, recons_img_2, recons_img_3], axis=1) + save_path = osp.abspath('non_teacher_force.jpg') + cv2.imwrite(save_path, cat_image) + print(f'Save to {save_path}') + import pdb; pdb.set_trace() + print(cat_image.shape) + \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/__init__.py b/Meissonic/InfinityStar/infinity/models/videovae/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce68af73834914377d02a72e2a8b5c04781718ac --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b29b9fd077bc3b9c5980c1297206db8f9cdfd43 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/models/__init__.py b/Meissonic/InfinityStar/infinity/models/videovae/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce68af73834914377d02a72e2a8b5c04781718ac --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/models/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b8c17a7eccfe27d642245dafffb10d6f81579ac Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/load_vae_bsq_wan_absorb_patchify.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/load_vae_bsq_wan_absorb_patchify.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c8f3d3ff87cac30adcded3c45f94ce80b095561 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/load_vae_bsq_wan_absorb_patchify.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/wan_bsq_vae.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/wan_bsq_vae.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d5cf26db3d7bd89390a469e8ce3b8e5daf473ec Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/models/__pycache__/wan_bsq_vae.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/models/load_vae_bsq_wan_absorb_patchify.py b/Meissonic/InfinityStar/infinity/models/videovae/models/load_vae_bsq_wan_absorb_patchify.py new file mode 100644 index 
0000000000000000000000000000000000000000..b17999b5a7c7aad55fa257b2065dea16926e53d6 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/models/load_vae_bsq_wan_absorb_patchify.py @@ -0,0 +1,266 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import os +os.environ['XFORMERS_FORCE_DISABLE_TRITON'] = '1' +import argparse +import torch + +from infinity.models.videovae.models.wan_bsq_vae import AutoencoderKLCogVideoX + +def video_vae_model(vqgan_ckpt, schedule_mode, codebook_dim, global_args=None, test_mode=True): + args=argparse.Namespace( + vqgan_ckpt=vqgan_ckpt, + sd_ckpt=None, + use_frames=None, + inference_type='video', + save_prediction=True, + save_dir='results', + intermediate_tensor=True, + save_z=False, + save_frames=False, + image_recon4video=False, + junke_old=False, + cal_norm=False, + save_samples=None, + device='cuda', + noise_scale=0.0, + max_steps=1000000.0, + log_every=1, + ckpt_every=1000, + default_root_dir='/tmp', + compile='no', + ema='no', + mfu_logging='no', + dataloader_init_epoch=-1, + context_parallel_size=0, + video_ranks_ratio=-1.0, + lr=0.0001, + beta1=0.9, + beta2=0.95, + optim_type='Adam', + disc_optim_type=None, + max_grad_norm=1.0, + max_grad_norm_disc=1.0, + disable_sch=False, + scheduler='no', + warmup_steps=0, + lr_min=0.0, + warmup_lr_init=0.0, + patch_size=8, + temporal_patch_size=4, + embedding_dim=256, + codebook_dim=16, + use_vae=True, + eq_scale_prior=0.0, + eq_angle_prior=0.0, + use_stochastic_depth=False, + drop_rate=0.0, + schedule_mode=schedule_mode, + lr_drop=None, + lr_drop_rate=0.1, + keep_first_quant=False, + keep_last_quant=False, + remove_residual_detach=False, + use_out_phi=False, + use_out_phi_res=False, + use_lecam_reg=False, + lecam_weight=0.05, + perceptual_model='vgg16', + base_ch_disc=64, + random_flip=False, + flip_prob=0.5, + flip_mode='stochastic', + max_flip_lvl=1, + not_load_optimizer=False, + use_lecam_reg_zero=False, + freeze_encoder=False, + rm_downsample=False, + random_flip_1lvl=False, + flip_lvl_idx=0, + drop_when_test=False, + drop_lvl_idx=None, + drop_lvl_num=0, + compute_all_commitment=False, + disable_codebook_usage=False, + freeze_enc_main=False, + freeze_dec_main=False, + random_short_schedule=False, + short_schedule_prob=0.5, + use_bernoulli=False, + use_rot_trick=False, + disable_flip_prob=0.0, + dino_disc=False, + quantizer_type='MultiScaleBSQTP', + lfq_weight=0.0, + entropy_loss_weight=0.1, + visu_every=1000, + commitment_loss_weight=0.25, + bsq_version='v1', + diversity_gamma=1, + bs1_for1024=False, + casual_multi_scale=False, + double_compress_t=False, + temporal_slicing=False, + latent_adjust_type=None, + compute_latent_loss=False, + latent_loss_weight=0.0, + use_raw_latentz=False, + last_scale_repeat_n=0, + num_lvl_fsq=5, + use_midscale_sup=False, + midscale_list=[0.5, 0.75, 1.0], + use_eq=False, + eq_prob=0.5, + disc_version='v1', + magvit_disc=False, + disc_type='patchgan', + sigmoid_in_disc=False, + activation_in_disc='leaky_relu', + apply_blur=False, + apply_noise=False, + dis_warmup_steps=0, + dis_lr_multiplier=1.0, + dis_minlr_multiplier=False, + disc_channels=64, + disc_layers=3, + discriminator_iter_start=0, + disc_pretrain_iter=0, + disc_optim_steps=1, + disc_warmup=0, + disc_pool='no', + disc_pool_size=100, + disc_temporal_compress='yes', + disc_use_blur='yes', + disc_stylegan_downsample_base=2, + fix_model=['no'], + recon_loss_type='l1', + image_gan_weight=1.0, + video_gan_weight=1.0, + image_disc_weight=0.0, + video_disc_weight=0.0, + vf_weight=0.0, + 
vf_weight_approx=-1, + vf_distmat_margin=0.25, + vf_cos_margin=0.5, + temporal_alignment=None, + l1_weight=4.0, + gan_feat_weight=0.0, + lpips_model='vgg', + perceptual_weight=0.0, + video_perceptual_weight=None, + video_perceptual_layers=[], + kl_weight=0.0, + norm_type='rms', + disc_loss_type='hinge', + gan_image4video='yes', + use_checkpoint=False, + precision='fp32', + encoder_dtype='fp32', + decoder_dtype='fp32', + upcast_attention='', + upcast_tf32=False, + tokenizer='cogvideoxd', + pretrained=None, + pretrained_mode='full', + pretrained_ema='no', + inflation_pe=False, + init_vgen='no', + no_init_idis=False, + init_idis='keep', + init_vdis='no', + enable_nan_detector=False, + turn_on_profiler=False, + profiler_scheduler_wait_steps=10, + debug=False, + video_logger=False, + bytenas='sg', + username='bin.yan', + seed=1234, + vq_to_vae=False, + load_not_strict=False, + zero=0, + bucket_cap_mb=40, + manual_gc_interval=10000, + data_path=[''], + data_type=[''], + dataset_list=['wanxvideo-v1'], + fps=[-1], + dataaug='resizecrop', + multi_resolution=False, + random_bucket_ratio=0.0, + sequence_length=81, + resolution=[(480, 864)], + resize_bucket=None, + resize_bucket_use_self='yes', + scaling_aug='no', + batch_size=[1], + num_workers=0, + image_channels=3, + in_channels=3, + out_channels=3, + down_block_types=['CogVideoXDownBlock3D', + 'CogVideoXDownBlock3D', + 'CogVideoXDownBlock3D', + 'CogVideoXDownBlock3D'], + down_block_mode='dc', + up_block_types=['CogVideoXUpBlock3D', + 'CogVideoXUpBlock3D', + 'CogVideoXUpBlock3D', + 'CogVideoXUpBlock3D'], + up_block_mode='dc', + block_out_channels=[96, 192, 384, 384, 384], + layers_per_block=2, + latent_channels=16, + act_fn='silu', + norm_eps=1e-06, + norm_num_groups=32, + spatial_compression_list=[2, 2, 2], + temporal_compression_list=[2, 2], + sample_height=480, + sample_width=720, + use_quant_conv=False, + use_post_quant_conv=False, + down_layer='3d-dc', + down_norm=True, + up_layer='3d-dc', + up_norm=True, + pad_mode='constant', + dropout_z=0.0, + flux_weight=0, + cycle_weight=0, + cycle_feat_weight=0, + cycle_gan_weight=0, + cycle_loop=0, + cycle_norm='no', + cycle_deterministic='no', + cycle_kl_weight=0, + z_drop=0.0, + intermediate_tensor_dir='/tmp', + codebook_dim_low=codebook_dim//4, + freeze_decoder=False, + semantic_scale_dim=global_args.semantic_scale_dim, + detail_scale_dim=global_args.detail_scale_dim, + use_learnable_dim_proj=global_args.use_learnable_dim_proj, + detail_scale_min_tokens=global_args.detail_scale_min_tokens, + use_feat_proj=global_args.use_feat_proj, + semantic_scales=global_args.semantic_scales, + use_multi_scale=0, + quant_not_rely_256=0, + semantic_num_lvl=2, + detail_num_lvl=2, + ) + + vae = AutoencoderKLCogVideoX(args) + state_dict = torch.load(args.vqgan_ckpt, map_location=torch.device("cpu"), weights_only=True) + if args.ema == "yes": + print("testing ema weights") + vae.load_state_dict(state_dict["ema"], strict=False) + else: + vae.load_state_dict(state_dict["vae"], strict=False) + + vae.enable_slicing() + if test_mode: + vae.eval() + [p.requires_grad_(False) for p in vae.parameters()] + return vae \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/models/wan_bsq_vae.py b/Meissonic/InfinityStar/infinity/models/videovae/models/wan_bsq_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..e2d31ee58cc9445152d53387e325aa8958ddb56e --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/models/wan_bsq_vae.py @@ -0,0 +1,1986 @@ +# Copyright (c) 
2025 FoundationVision +# SPDX-License-Identifier: MIT + +from typing import Dict, Optional, Tuple, Union +import math +import numpy as np +from einops import rearrange +import argparse +import random +import torch +import torch.nn as nn +import torch.nn.functional as F +import timm + +from infinity.models.videovae.modules import DiagonalGaussianDistribution +from infinity.models.videovae.utils.misc import ptdtype +from infinity.models.videovae.modules.quantizer import MultiScaleBSQTP_AP as MultiScaleBSQTP_AP +from infinity.models.videovae.modules.quantizer import MultiScaleFSQTP +from infinity.models.videovae.modules.conv_wan import DCDownBlock2d, DCUpBlock2d, DCDownBlock3d, DCUpBlock3d, CogVideoXCausalConv3d, CogVideoXSafeConv3d +from infinity.models.videovae.modules.normalization_wan import get_norm +from infinity.models.videovae.utils.context_parallel import ContextParallelUtils as cp +from infinity.models.videovae.utils.context_parallel import dist_decoder_gather_result, dist_encoder_gather_result +from infinity.models.videovae.utils.dynamic_resolution_two_pyramid import get_ratio2hws_video_v2 + + +def patchify(item): + assert item.ndim == 5 + # (B,c,t,H,W) -> (B,t,c,H,W) -> (B,t,4c,H/2,W/2) -> (B,4c,t,H/2,W/2) + item = torch.nn.functional.pixel_unshuffle(item.permute(0,2,1,3,4), 2).permute(0,2,1,3,4) + return item + +def unpatchify(item): + assert item.ndim == 5 + item = item.permute(0,2,1,3,4) # (B,4c,t,H/2,W/2) -> [B, t, 4c, H/2, W/2] + item = torch.nn.functional.pixel_shuffle(item, 2) # [B, t, 4c, H/2, W/2] -> [B, t, c, H, W] + item = item.permute(0,2,1,3,4) # [B, t, c, H, W] -> [B, c, t, H, W] + return item + +class CogVideoXDownsample3D(nn.Module): + # Todo: Wait for paper relase. + r""" + A 3D Downsampling layer using in [CogVideoX]() by Tsinghua University & ZhipuAI + + Args: + in_channels (`int`): + Number of channels in the input image. + out_channels (`int`): + Number of channels produced by the convolution. + kernel_size (`int`, defaults to `3`): + Size of the convolving kernel. + stride (`int`, defaults to `2`): + Stride of the convolution. + padding (`int`, defaults to `0`): + Padding added to all four sides of the input. + compress_time (`bool`, defaults to `False`): + Whether or not to compress the time dimension. 
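        down_layer (`str`, defaults to `"conv"`):
            Which downsampling layer to build: `"conv"` (a plain `nn.Conv2d`), `"dc"`
            (`DCDownBlock2d`) or `"3d-dc"` (`DCDownBlock3d`).
        down_norm (`bool`, defaults to `False`):
            Whether to apply group normalization inside the DC down blocks.
        pad_mode (`str`, defaults to `"constant"`):
            Padding mode applied before the strided convolution and forwarded to the DC blocks.
        norm_type (`str`, *optional*):
            Normalization variant forwarded to `DCDownBlock3d`.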
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 2, + padding: int = 0, + compress_time = None, + down_layer = "conv", + down_norm = False, + pad_mode = "constant", + norm_type=None, + ): + super().__init__() + + self.pad_mode = pad_mode + self.down_layer = down_layer + if down_layer == "conv": + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) + elif down_layer == "dc": + self.conv = DCDownBlock2d(in_channels, out_channels, downsample=True, shortcut=True, pad_mode=pad_mode, group_norm=down_norm) + elif down_layer == "3d-dc": + self.conv = DCDownBlock3d(in_channels, out_channels, group_norm=down_norm, compress_time=compress_time, pad_mode=pad_mode, norm_type=norm_type) + self.compress_time = compress_time + + def forward(self, x: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + if self.down_layer == "3d-dc": + x, new_conv_cache = self.conv(x, conv_cache=conv_cache) + else: + if self.compress_time == 2: + batch_size, channels, frames, height, width = x.shape + + # (batch_size, channels, frames, height, width) -> (batch_size, height, width, channels, frames) -> (batch_size * height * width, channels, frames) + x = x.permute(0, 3, 4, 1, 2).reshape(batch_size * height * width, channels, frames) + + if x.shape[-1] % 2 == 1: + x_first, x_rest = x[..., 0], x[..., 1:] + if x_rest.shape[-1] > 0: + # (batch_size * height * width, channels, frames - 1) -> (batch_size * height * width, channels, (frames - 1) // 2) + x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2) + + x = torch.cat([x_first[..., None], x_rest], dim=-1) + # (batch_size * height * width, channels, (frames // 2) + 1) -> (batch_size, height, width, channels, (frames // 2) + 1) -> (batch_size, channels, (frames // 2) + 1, height, width) + x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2) + else: + # (batch_size * height * width, channels, frames) -> (batch_size * height * width, channels, frames // 2) + x = F.avg_pool1d(x, kernel_size=2, stride=2) + # (batch_size * height * width, channels, frames // 2) -> (batch_size, height, width, channels, frames // 2) -> (batch_size, channels, frames // 2, height, width) + x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2) + elif self.compress_time == 3: + batch_size, channels, frames, height, width = x.shape + x = x.permute(0, 3, 4, 1, 2).reshape(batch_size * height * width, channels, frames) + + if x.shape[-1] % 2 == 1: + x_first, x_rest = x[..., 0], x[..., 1:] + if x_rest.shape[-1] > 0: + x_rest = F.avg_pool1d(x_rest, kernel_size=3, stride=3) + + x = torch.cat([x_first[..., None], x_rest], dim=-1) + # (batch_size * height * width, channels, (frames // 2) + 1) -> (batch_size, height, width, channels, (frames // 2) + 1) -> (batch_size, channels, (frames // 2) + 1, height, width) + x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2) + else: + # (batch_size * height * width, channels, frames) -> (batch_size * height * width, channels, frames // 2) + x = F.avg_pool1d(x, kernel_size=3, stride=3) + # (batch_size * height * width, channels, frames // 2) -> (batch_size, height, width, channels, frames // 2) -> (batch_size, channels, frames // 2, height, width) + x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2) + + # Pad the tensor + if 
self.down_layer == "conv": + pad = (0, 1, 0, 1) + if self.pad_mode == "constant": + x = F.pad(x, pad, mode="constant", value=0) + else: + _shape = x.shape + x = F.pad(x, pad, mode="replicate") + inputs = inputs.view(*_shape[:-2], *inputs.shape[-2:]) + + batch_size, channels, frames, height, width = x.shape + # (batch_size, channels, frames, height, width) -> (batch_size, frames, channels, height, width) -> (batch_size * frames, channels, height, width) + x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channels, height, width) + x = self.conv(x) + # (batch_size * frames, channels, height, width) -> (batch_size, frames, channels, height, width) -> (batch_size, channels, frames, height, width) + x = x.reshape(batch_size, frames, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4) + return x, new_conv_cache + + +class CogVideoXUpsample3D(nn.Module): + r""" + A 3D Upsample layer using in CogVideoX by Tsinghua University & ZhipuAI # Todo: Wait for paper relase. + + Args: + in_channels (`int`): + Number of channels in the input image. + out_channels (`int`): + Number of channels produced by the convolution. + kernel_size (`int`, defaults to `3`): + Size of the convolving kernel. + stride (`int`, defaults to `1`): + Stride of the convolution. + padding (`int`, defaults to `1`): + Padding added to all four sides of the input. + compress_time (`bool`, defaults to `False`): + Whether or not to compress the time dimension. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + padding: int = 1, + compress_time = None, + up_layer = "conv", + up_norm = False, + norm_type = None, + pad_mode = "constant", + ) -> None: + super().__init__() + + self.up_layer = up_layer + if up_layer == "conv": + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) + elif up_layer == "dc": + self.conv = DCUpBlock2d(in_channels, out_channels, interpolate=False, shortcut=True, group_norm=up_norm, norm_type=norm_type, pad_mode=pad_mode) + elif up_layer == "3d-dc": + self.conv = DCUpBlock3d(in_channels, out_channels, group_norm=up_norm, compress_time=compress_time, norm_type=norm_type, pad_mode=pad_mode) + self.compress_time = compress_time + + def forward(self, inputs: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None, split_first=False) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + if self.up_layer == "3d-dc": + inputs, new_conv_cache = self.conv(inputs, conv_cache=conv_cache, split_first=split_first) + else: + raise NotImplementedError + if self.up_layer == "conv": + spatial_scale = (2., 2.) + elif self.up_layer == "dc": + spatial_scale = (1., 1.) 
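# Interpolation path: when the time axis is compressed and the clip has an odd number of
# frames (> 1), the first frame acts as the causal anchor and is only upsampled spatially,
# while the remaining frames are interpolated along both time and space; even-length clips
# are interpolated uniformly, and a single frame is handled as a pure 2D upsample.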
+ if self.compress_time: + temporal_scale = (float(self.compress_time), *spatial_scale) + if inputs.shape[2] > 1 and inputs.shape[2] % 2 == 1: + # split first frame + x_first, x_rest = inputs[:, :, 0], inputs[:, :, 1:] + x_first = F.interpolate(x_first, scale_factor=spatial_scale) + x_rest = F.interpolate(x_rest, scale_factor=temporal_scale) + x_first = x_first[:, :, None, :, :] + inputs = torch.cat([x_first, x_rest], dim=2) + elif inputs.shape[2] > 1: + inputs = F.interpolate(inputs, scale_factor=temporal_scale) + else: + inputs = inputs.squeeze(2) + inputs = F.interpolate(inputs, scale_factor=spatial_scale) + inputs = inputs[:, :, None, :, :] + else: + # only interpolate 2D + b, c, t, h, w = inputs.shape + inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w) + inputs = F.interpolate(inputs, scale_factor=spatial_scale) + inputs = inputs.reshape(b, t, c, *inputs.shape[2:]).permute(0, 2, 1, 3, 4) + + b, c, t, h, w = inputs.shape + inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w) + inputs = self.conv(inputs) + inputs = inputs.reshape(b, t, *inputs.shape[1:]).permute(0, 2, 1, 3, 4) + return inputs, new_conv_cache + +class CogVideoXSpatialNorm3D(nn.Module): + r""" + Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002. This implementation is specific + to 3D-video like data. + + CogVideoXSafeConv3d is used instead of nn.Conv3d to avoid OOM in CogVideoX Model. + + Args: + f_channels (`int`): + The number of channels for input to group normalization layer, and output of the spatial norm layer. + zq_channels (`int`): + The number of channels for the quantized vector as described in the paper. + groups (`int`): + Number of groups to separate the channels into for group normalization. + """ + + def __init__( + self, + f_channels: int, + zq_channels: int, + groups: int = 32, + norm_type = None, + pad_mode = "constant" + ): + super().__init__() + norm_layer = get_norm(norm_type) + self.norm_layer = norm_layer(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True) + self.conv_y = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1, pad_mode=pad_mode) + self.conv_b = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1, pad_mode=pad_mode) + + def forward( + self, f: torch.Tensor, zq: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None + ) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + if f.shape[2] > 1 and f.shape[2] % 2 == 1: + f_first, f_rest = f[:, :, :1], f[:, :, 1:] + f_first_size, f_rest_size = f_first.shape[-3:], f_rest.shape[-3:] + z_first, z_rest = zq[:, :, :1], zq[:, :, 1:] + z_first = F.interpolate(z_first, size=f_first_size) + z_rest = F.interpolate(z_rest, size=f_rest_size) + zq = torch.cat([z_first, z_rest], dim=2) + else: + zq = F.interpolate(zq, size=f.shape[-3:]) + + conv_y, new_conv_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y")) + conv_b, new_conv_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b")) + + norm_f = self.norm_layer(f) + new_f = norm_f * conv_y + conv_b + return new_f, new_conv_cache + + +class CogVideoXResnetBlock3D(nn.Module): + r""" + A 3D ResNet block used in the CogVideoX model. + + Args: + in_channels (`int`): + Number of input channels. + out_channels (`int`, *optional*): + Number of output channels. If None, defaults to `in_channels`. + dropout (`float`, defaults to `0.0`): + Dropout rate. + temb_channels (`int`, defaults to `512`): + Number of time embedding channels. 
+ groups (`int`, defaults to `32`): + Number of groups to separate the channels into for group normalization. + eps (`float`, defaults to `1e-6`): + Epsilon value for normalization layers. + conv_shortcut (bool, defaults to `False`): + Whether or not to use a convolution shortcut. + spatial_norm_dim (`int`, *optional*): + The dimension to use for spatial norm if it is to be used instead of group norm. + pad_mode (str, defaults to `"constant"`): + Padding mode. + """ + + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + dropout: float = 0.0, + temb_channels: int = 512, + groups: int = 32, + eps: float = 1e-6, + conv_shortcut: bool = False, + spatial_norm_dim: Optional[int] = None, + pad_mode: str = "constant", + norm_type = None, + ): + super().__init__() + norm_layer = get_norm(norm_type) + out_channels = out_channels or in_channels + + self.in_channels = in_channels + self.out_channels = out_channels + self.nonlinearity = nn.SiLU() + self.use_conv_shortcut = conv_shortcut + self.spatial_norm_dim = spatial_norm_dim + + if spatial_norm_dim is None: + self.norm1 = norm_layer(num_channels=in_channels, num_groups=groups, eps=eps) + self.norm2 = norm_layer(num_channels=out_channels, num_groups=groups, eps=eps) + else: + self.norm1 = CogVideoXSpatialNorm3D( + f_channels=in_channels, + zq_channels=spatial_norm_dim, + groups=groups, + norm_type=norm_type, + pad_mode=pad_mode, + ) + self.norm2 = CogVideoXSpatialNorm3D( + f_channels=out_channels, + zq_channels=spatial_norm_dim, + groups=groups, + norm_type=norm_type, + pad_mode=pad_mode, + ) + + self.conv1 = CogVideoXCausalConv3d( + in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode + ) + + if temb_channels > 0: + self.temb_proj = nn.Linear(in_features=temb_channels, out_features=out_channels) + + self.dropout = nn.Dropout(dropout) + self.conv2 = CogVideoXCausalConv3d( + in_channels=out_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode + ) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CogVideoXCausalConv3d( + in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode + ) + else: + self.conv_shortcut = CogVideoXSafeConv3d( + in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward( + self, + inputs: torch.Tensor, + temb: Optional[torch.Tensor] = None, + zq: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + hidden_states = inputs + + if zq is not None: + hidden_states, new_conv_cache["norm1"] = self.norm1(hidden_states, zq, conv_cache=conv_cache.get("norm1")) + else: + hidden_states = self.norm1(hidden_states) + + hidden_states = self.nonlinearity(hidden_states) + hidden_states, new_conv_cache["conv1"] = self.conv1(hidden_states, conv_cache=conv_cache.get("conv1")) + + if temb is not None: + hidden_states = hidden_states + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None] + + if zq is not None: + hidden_states, new_conv_cache["norm2"] = self.norm2(hidden_states, zq, conv_cache=conv_cache.get("norm2")) + else: + hidden_states = self.norm2(hidden_states) + + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states, new_conv_cache["conv2"] = self.conv2(hidden_states, conv_cache=conv_cache.get("conv2")) + + if self.in_channels != self.out_channels: + if 
self.use_conv_shortcut: + inputs, new_conv_cache["conv_shortcut"] = self.conv_shortcut( + inputs, conv_cache=conv_cache.get("conv_shortcut") + ) + else: + inputs = self.conv_shortcut(inputs) + + hidden_states = hidden_states + inputs + return hidden_states, new_conv_cache + + +class CogVideoXDownBlock3D(nn.Module): + r""" + A downsampling block used in the CogVideoX model. + + Args: + in_channels (`int`): + Number of input channels. + out_channels (`int`, *optional*): + Number of output channels. If None, defaults to `in_channels`. + temb_channels (`int`, defaults to `512`): + Number of time embedding channels. + num_layers (`int`, defaults to `1`): + Number of resnet layers. + dropout (`float`, defaults to `0.0`): + Dropout rate. + resnet_eps (`float`, defaults to `1e-6`): + Epsilon value for normalization layers. + resnet_groups (`int`, defaults to `32`): + Number of groups to separate the channels into for group normalization. + add_downsample (`bool`, defaults to `True`): + Whether or not to use a downsampling layer. If not used, output dimension would be same as input dimension. + compress_time (`bool`, defaults to `False`): + Whether or not to downsample across temporal dimension. + pad_mode (str, defaults to `"constant"`): + Padding mode. + """ + + _supports_gradient_checkpointing = True + + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_groups: int = 32, + add_downsample: bool = True, + downsample_padding: int = 0, + compress_time = None, + compress_spatial = None, + pad_mode: str = "constant", + norm_type = None, + down_layer = "conv", + down_block_mode = "cogvideox", + down_norm = False, + ): + super().__init__() + + if down_block_mode == "cogvideox": + resnets = [] + for i in range(num_layers): + in_channel = in_channels if i == 0 else out_channels + resnets.append( + CogVideoXResnetBlock3D( + in_channels=in_channel, + out_channels=out_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=resnet_groups, + eps=resnet_eps, + pad_mode=pad_mode, + norm_type=norm_type + ) + ) + self.resnets = nn.ModuleList(resnets) + self.downsamplers = None + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + CogVideoXDownsample3D( + out_channels, out_channels, padding=downsample_padding, compress_time=compress_time, down_layer=down_layer, down_norm=down_norm, pad_mode=pad_mode, norm_type=norm_type, + ) + ] + ) + elif down_block_mode == "dc": + resnets = [] + for i in range(num_layers): + resnets.append( + CogVideoXResnetBlock3D( + in_channels=in_channels, + out_channels=in_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=resnet_groups, + eps=resnet_eps, + pad_mode=pad_mode, + norm_type=norm_type + ) + ) + self.resnets = nn.ModuleList(resnets) + self.downsamplers = None + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + CogVideoXDownsample3D( + in_channels, out_channels, padding=downsample_padding, compress_time=compress_time, down_layer=down_layer,down_norm=down_norm, pad_mode=pad_mode, norm_type=norm_type, + ) + ] + ) + else: + raise NotImplementedError(f"Invalid `down_block_mode` {down_block_mode} encountered. 
") + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + zq: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + r"""Forward method of the `CogVideoXDownBlock3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + for i, resnet in enumerate(self.resnets): + conv_cache_key = f"resnet_{i}" + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def create_forward(*inputs): + return module(*inputs) + + return create_forward + + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + zq, + conv_cache.get(conv_cache_key), + use_reentrant=False + ) + else: + hidden_states, new_conv_cache[conv_cache_key] = resnet( + hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key) + ) + + if self.downsamplers is not None: + for i, downsampler in enumerate(self.downsamplers): + conv_cache_key = f"downsampler_{i}" + hidden_states, new_conv_cache[conv_cache_key] = downsampler(hidden_states, conv_cache=conv_cache.get(conv_cache_key)) + + return hidden_states, new_conv_cache + + +class CogVideoXMidBlock3D(nn.Module): + r""" + A middle block used in the CogVideoX model. + + Args: + in_channels (`int`): + Number of input channels. + temb_channels (`int`, defaults to `512`): + Number of time embedding channels. + dropout (`float`, defaults to `0.0`): + Dropout rate. + num_layers (`int`, defaults to `1`): + Number of resnet layers. + resnet_eps (`float`, defaults to `1e-6`): + Epsilon value for normalization layers. + resnet_groups (`int`, defaults to `32`): + Number of groups to separate the channels into for group normalization. + spatial_norm_dim (`int`, *optional*): + The dimension to use for spatial norm if it is to be used instead of group norm. + pad_mode (str, defaults to `"constant"`): + Padding mode. 
+ """ + + _supports_gradient_checkpointing = True + + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_groups: int = 32, + spatial_norm_dim: Optional[int] = None, + pad_mode: str = "constant", + norm_type = None + ): + super().__init__() + + resnets = [] + for _ in range(num_layers): + resnets.append( + CogVideoXResnetBlock3D( + in_channels=in_channels, + out_channels=in_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=resnet_groups, + eps=resnet_eps, + spatial_norm_dim=spatial_norm_dim, + pad_mode=pad_mode, + norm_type=norm_type, + ) + ) + self.resnets = nn.ModuleList(resnets) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + zq: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + r"""Forward method of the `CogVideoXMidBlock3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + for i, resnet in enumerate(self.resnets): + conv_cache_key = f"resnet_{i}" + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def create_forward(*inputs): + return module(*inputs) + + return create_forward + + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key), use_reentrant=False + ) + else: + hidden_states, new_conv_cache[conv_cache_key] = resnet( + hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key) + ) + + return hidden_states, new_conv_cache + + +class CogVideoXUpBlock3D(nn.Module): + r""" + An upsampling block used in the CogVideoX model. + + Args: + in_channels (`int`): + Number of input channels. + out_channels (`int`, *optional*): + Number of output channels. If None, defaults to `in_channels`. + temb_channels (`int`, defaults to `512`): + Number of time embedding channels. + dropout (`float`, defaults to `0.0`): + Dropout rate. + num_layers (`int`, defaults to `1`): + Number of resnet layers. + resnet_eps (`float`, defaults to `1e-6`): + Epsilon value for normalization layers. + resnet_groups (`int`, defaults to `32`): + Number of groups to separate the channels into for group normalization. + spatial_norm_dim (`int`, defaults to `16`): + The dimension to use for spatial norm if it is to be used instead of group norm. + add_upsample (`bool`, defaults to `True`): + Whether or not to use a upsampling layer. If not used, output dimension would be same as input dimension. + compress_time (`bool`, defaults to `False`): + Whether or not to downsample across temporal dimension. + pad_mode (str, defaults to `"constant"`): + Padding mode. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_groups: int = 32, + spatial_norm_dim: int = 16, + add_upsample: bool = True, + upsample_padding: int = 1, + compress_time = None, + compress_spatial = None, + pad_mode: str = "constant", + norm_type = None, + up_layer = "conv", + up_block_mode="cogvideox", + up_norm = False, + ): + super().__init__() + + if up_block_mode == "cogvideox": + resnets = [] + for i in range(num_layers): + in_channel = in_channels if i == 0 else out_channels + resnets.append( + CogVideoXResnetBlock3D( + in_channels=in_channel, + out_channels=out_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=resnet_groups, + eps=resnet_eps, + spatial_norm_dim=spatial_norm_dim, + pad_mode=pad_mode, + norm_type=norm_type, + ) + ) + self.resnets = nn.ModuleList(resnets) + self.upsamplers = None + if add_upsample: + self.upsamplers = nn.ModuleList( + [ + CogVideoXUpsample3D( + out_channels, out_channels, padding=upsample_padding, compress_time=compress_time, up_layer=up_layer, up_norm=up_norm, norm_type=norm_type, pad_mode=pad_mode + ) + ] + ) + elif up_block_mode == "dc": + resnets = [] + for i in range(num_layers): + resnets.append( + CogVideoXResnetBlock3D( + in_channels=in_channels, + out_channels=in_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=resnet_groups, + eps=resnet_eps, + spatial_norm_dim=spatial_norm_dim, + pad_mode=pad_mode, + norm_type=norm_type, + ) + ) + self.resnets = nn.ModuleList(resnets) + self.upsamplers = None + if add_upsample: + self.upsamplers = nn.ModuleList( + [ + CogVideoXUpsample3D( + in_channels, out_channels, padding=upsample_padding, compress_time=compress_time, up_layer=up_layer, up_norm=up_norm, norm_type=norm_type, pad_mode=pad_mode + ) + ] + ) + else: + raise NotImplementedError(f"Invalid `up_block_mode` {up_block_mode} encountered. ") + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + zq: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + split_first = False, + ) -> torch.Tensor: + r"""Forward method of the `CogVideoXUpBlock3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + for i, resnet in enumerate(self.resnets): + conv_cache_key = f"resnet_{i}" + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def create_forward(*inputs): + return module(*inputs) + + return create_forward + + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + zq, + conv_cache.get(conv_cache_key), + use_reentrant=False + ) + else: + hidden_states, new_conv_cache[conv_cache_key] = resnet( + hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key) + ) + + if self.upsamplers is not None: + for i, upsampler in enumerate(self.upsamplers): + conv_cache_key = f"upsampler_{i}" + hidden_states, new_conv_cache[conv_cache_key] = upsampler(hidden_states, conv_cache=conv_cache.get(conv_cache_key), split_first=split_first) + + return hidden_states, new_conv_cache + + +class CogVideoXEncoder3D(nn.Module): + _supports_gradient_checkpointing = True + def __init__( + self, + in_channels: int = 3, + out_channels: int = 16, + down_block_types: Tuple[str, ...] 
= ( + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + ), + block_out_channels: Tuple[int, ...] = (128, 128, 256, 256, 512), + layers_per_block: int = 3, + act_fn: str = "silu", + norm_eps: float = 1e-6, + norm_num_groups: int = 32, + dropout: float = 0.0, + pad_mode: str = "constant", + temporal_compression_list: list = [], + spatial_compression_list: list = [], + norm_type=None, + down_layer = "conv", + down_block_mode = "cogvideox", + down_norm=False, + ): + super().__init__() + + norm_layer = get_norm(norm_type) + # log2 of temporal_compress_times + # temporal_compress_level = int(np.log2(temporal_compression_ratio)) + + self.conv_in = CogVideoXCausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode) + + self.down_blocks = nn.ModuleList([]) + + # down blocks + for i, down_block_type in enumerate(down_block_types): + input_channel = block_out_channels[i] + output_channel = block_out_channels[i+1] + compress_time = temporal_compression_list[i] if i < len(temporal_compression_list) else None + compress_spatial = spatial_compression_list[i] if i < len(spatial_compression_list) else None + + if down_block_type == "CogVideoXDownBlock3D": + down_block = CogVideoXDownBlock3D( + in_channels=input_channel, + out_channels=output_channel, + temb_channels=0, + dropout=dropout, + num_layers=layers_per_block, + resnet_eps=norm_eps, + resnet_groups=norm_num_groups, + add_downsample=compress_time or compress_spatial, + compress_time=compress_time, + compress_spatial=compress_spatial, + pad_mode=pad_mode, + norm_type=norm_type, + down_layer=down_layer, + down_block_mode=down_block_mode, + down_norm=down_norm, + ) + else: + raise ValueError("Invalid `down_block_type` encountered. Must be `CogVideoXDownBlock3D`") + + self.down_blocks.append(down_block) + + # mid block + self.mid_block = CogVideoXMidBlock3D( + in_channels=block_out_channels[len(down_block_types)], + temb_channels=0, + dropout=dropout, + num_layers=2, + resnet_eps=norm_eps, + resnet_groups=norm_num_groups, + pad_mode=pad_mode, + norm_type=norm_type, + ) + + self.norm_out = norm_layer(num_channels=block_out_channels[len(down_block_types)], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = CogVideoXCausalConv3d( + block_out_channels[len(down_block_types)], 2 * out_channels, kernel_size=3, pad_mode=pad_mode + ) + + self.gradient_checkpointing = False + + def forward( + self, + sample: torch.Tensor, + temb: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + r"""The forward method of the `CogVideoXEncoder3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in")) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # 1. Down + for i, down_block in enumerate(self.down_blocks): + conv_cache_key = f"down_block_{i}" + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(down_block), + hidden_states, + temb, + None, + conv_cache.get(conv_cache_key), + use_reentrant=False + ) + + # 2. 
Mid + hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + hidden_states, + temb, + None, + conv_cache.get("mid_block"), + use_reentrant=False + ) + else: + # 1. Down + for i, down_block in enumerate(self.down_blocks): + conv_cache_key = f"down_block_{i}" + hidden_states, new_conv_cache[conv_cache_key] = down_block( + hidden_states, temb, None, conv_cache=conv_cache.get(conv_cache_key) + ) + + # 2. Mid + hidden_states, new_conv_cache["mid_block"] = self.mid_block( + hidden_states, temb, None, conv_cache=conv_cache.get("mid_block") + ) + + # 3. Post-process + hidden_states = self.norm_out(hidden_states) + hidden_states = self.conv_act(hidden_states) + + hidden_states, new_conv_cache["conv_out"] = self.conv_out(hidden_states, conv_cache=conv_cache.get("conv_out")) + + return hidden_states, new_conv_cache + + +class CogVideoXDecoder3D(nn.Module): + r""" + The `CogVideoXDecoder3D` layer of a variational autoencoder that decodes its latent representation into an output + sample. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + """ + + _supports_gradient_checkpointing = True + + def __init__( + self, + in_channels: int = 16, + out_channels: int = 3, + up_block_types: Tuple[str, ...] = ( + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + ), + block_out_channels: Tuple[int, ...] 
= (128, 128, 256, 256, 512), + layers_per_block: int = 3, + act_fn: str = "silu", + norm_eps: float = 1e-6, + norm_num_groups: int = 32, + dropout: float = 0.0, + pad_mode: str = "constant", + temporal_compression_list: list = [], + spatial_compression_list: list = [], + norm_type=None, + up_layer="conv", + up_block_mode="cogvideox", + up_norm=False, + ): + super().__init__() + + reversed_block_out_channels = list(reversed(block_out_channels)) + + self.conv_in = CogVideoXCausalConv3d( + in_channels, reversed_block_out_channels[0], kernel_size=3, pad_mode=pad_mode + ) + + # mid block + self.mid_block = CogVideoXMidBlock3D( + in_channels=reversed_block_out_channels[0], + temb_channels=0, + num_layers=2, + resnet_eps=norm_eps, + resnet_groups=norm_num_groups, + spatial_norm_dim=in_channels, + pad_mode=pad_mode, + norm_type=norm_type, + ) + + # up blocks + self.up_blocks = nn.ModuleList([]) + + # output_channel = reversed_block_out_channels[0] + # temporal_compress_level = int(np.log2(temporal_compression_ratio)) + + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = reversed_block_out_channels[i] + output_channel = reversed_block_out_channels[i+1] + if up_block_mode == "cogvideox": + raise NotImplementedError + is_final_block = i == len(up_block_types) - 1 + compress_time = temporal_compression_list[i] if i < len(temporal_compression_list) else None + compress_spatial = spatial_compression_list[i] if i < len(spatial_compression_list) else None + elif up_block_mode == "dc": + # is_final_block = i == 0 + idx_temporal = i - (len(up_block_types) - len(temporal_compression_list)) + compress_time = temporal_compression_list[-idx_temporal] if idx_temporal >= 0 else None + idx_spatial = i - (len(up_block_types) - len(spatial_compression_list)) + compress_spatial = spatial_compression_list[-idx_spatial] if idx_spatial >= 0 else None + # print(temporal_compression_list, idx_temporal, compress_time, spatial_compression_list, idx_spatial, compress_spatial, compress_time or compress_spatial) + + if up_block_type == "CogVideoXUpBlock3D": + up_block = CogVideoXUpBlock3D( + in_channels=prev_output_channel, + out_channels=output_channel, + temb_channels=0, + dropout=dropout, + num_layers=layers_per_block + 1, + resnet_eps=norm_eps, + resnet_groups=norm_num_groups, + spatial_norm_dim=in_channels, + add_upsample=compress_time or compress_spatial, + compress_time=compress_time, + compress_spatial=compress_spatial, + pad_mode=pad_mode, + norm_type=norm_type, + up_layer=up_layer, + up_block_mode=up_block_mode, + up_norm=up_norm, + ) + prev_output_channel = output_channel + else: + raise ValueError("Invalid `up_block_type` encountered. 
Must be `CogVideoXUpBlock3D`") + + self.up_blocks.append(up_block) + + self.norm_out = CogVideoXSpatialNorm3D(reversed_block_out_channels[len(up_block_types)], in_channels, groups=norm_num_groups, norm_type=norm_type, pad_mode=pad_mode) + self.conv_act = nn.SiLU() + self.conv_out = CogVideoXCausalConv3d( + reversed_block_out_channels[len(up_block_types)], out_channels, kernel_size=3, pad_mode=pad_mode + ) + + self.gradient_checkpointing = False + + def forward( + self, + sample: torch.Tensor, + temb: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + split_first = False, + ) -> torch.Tensor: + r"""The forward method of the `CogVideoXDecoder3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in")) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # 1. Mid + hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + hidden_states, + temb, + sample, + conv_cache.get("mid_block"), + use_reentrant=False + ) + + # 2. Up + for i, up_block in enumerate(self.up_blocks): + conv_cache_key = f"up_block_{i}" + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + hidden_states, + temb, + sample, + conv_cache.get(conv_cache_key), + split_first, + use_reentrant=False + ) + else: + # 1. Mid + hidden_states, new_conv_cache["mid_block"] = self.mid_block( + hidden_states, temb, sample, conv_cache=conv_cache.get("mid_block") + ) + + # 2. Up + for i, up_block in enumerate(self.up_blocks): + conv_cache_key = f"up_block_{i}" + hidden_states, new_conv_cache[conv_cache_key] = up_block( + hidden_states, temb, sample, conv_cache=conv_cache.get(conv_cache_key), split_first=split_first + ) + + # 3. 
Post-process + hidden_states, new_conv_cache["norm_out"] = self.norm_out( + hidden_states, sample, conv_cache=conv_cache.get("norm_out") + ) + hidden_states = self.conv_act(hidden_states) + hidden_states, new_conv_cache["conv_out"] = self.conv_out(hidden_states, conv_cache=conv_cache.get("conv_out")) + + return hidden_states, new_conv_cache + + +class AutoencoderKLCogVideoX(nn.Module): + _supports_gradient_checkpointing = True + _no_split_modules = ["CogVideoXResnetBlock3D"] + + def __init__( + self, + args + ): + super().__init__() + self.args = args + self.embed_dim = args.latent_channels + self.encoder_dtype = ptdtype[args.encoder_dtype] + self.decoder_dtype = ptdtype[args.decoder_dtype] + + self.encoder = CogVideoXEncoder3D( + in_channels=args.in_channels, + out_channels=args.latent_channels, + down_block_types=args.down_block_types, + block_out_channels=args.block_out_channels, + layers_per_block=args.layers_per_block, + act_fn=args.act_fn, + norm_eps=args.norm_eps, + norm_num_groups=args.norm_num_groups, + temporal_compression_list=args.temporal_compression_list, + spatial_compression_list=args.spatial_compression_list, + pad_mode=args.pad_mode, + norm_type=args.norm_type, + down_layer=args.down_layer, + down_block_mode=args.down_block_mode, + down_norm=args.down_norm, + ) + self.decoder = CogVideoXDecoder3D( + in_channels=args.latent_channels, + out_channels=args.out_channels, + up_block_types=args.up_block_types, + block_out_channels=args.block_out_channels, + layers_per_block=args.layers_per_block, + act_fn=args.act_fn, + norm_eps=args.norm_eps, + norm_num_groups=args.norm_num_groups, + temporal_compression_list=args.temporal_compression_list, + spatial_compression_list=args.spatial_compression_list, + pad_mode=args.pad_mode, + norm_type=args.norm_type, + up_layer=args.up_layer, + up_block_mode=args.up_block_mode, + up_norm=args.up_norm, + ) + self.dropout_z_layer = nn.Dropout(p=args.dropout_z) + if args.use_checkpoint: + self._set_gradient_checkpointing(self.encoder, True) + self._set_gradient_checkpointing(self.decoder, True) + + if args.fix_model != ["no"]: + for _model in args.fix_model: + if _model == "encoder": + self._set_no_grad(self.encoder) + elif _model == "decoder": + self._set_no_grad(self.decoder) + elif _model.startswith("down_blocks"): + fix_block_num = int(_model.split("_")[2]) + self._set_no_grad(self.encoder.conv_in) + for idx in range(fix_block_num): + self._set_no_grad(self.encoder.down_blocks[idx]) + elif _model.startswith("up_blocks"): + fix_block_num = int(_model.split("_")[2]) + self._set_no_grad(self.decoder.conv_out) + self._set_no_grad(self.decoder.norm_out) + for idx in range(fix_block_num): + total_num = len(self.decoder.up_blocks) + self._set_no_grad(self.decoder.up_blocks[total_num - idx - 1]) # reverse fix + else: + raise NotImplementedError + + print("Learnable Parameters:") + for name, param in self.named_parameters(): + if param.requires_grad: + print(name) + + # for down_block in self.encoder.down_blocks: + # if down_block.downsamplers is not None: + # print(f"downsample compress time {down_block.downsamplers[0].compress_time}") + # else: + # print(f"downsample None") + # for up_block in self.decoder.up_blocks: + # if up_block.upsamplers is not None: + # print(f"upsample compress time {up_block.upsamplers[0].compress_time}") + # else: + # print("upsample None") + + self.quant_conv = CogVideoXSafeConv3d(2 * args.out_channels, 2 * args.out_channels, 1) if args.use_quant_conv else None + self.post_quant_conv = 
CogVideoXSafeConv3d(args.out_channels, args.out_channels, 1) if args.use_post_quant_conv else None + + self.use_slicing = False + self.use_tiling = False + + # Can be increased to decode more latent frames at once, but comes at a reasonable memory cost and it is not + # recommended because the temporal parts of the VAE, here, are tricky to understand. + # If you decode X latent frames together, the number of output frames is: + # (X + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) => X + 6 frames + # + # Example with num_latent_frames_batch_size = 2: + # - 12 latent frames: (0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11) are processed together + # => (12 // 2 frame slices) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) + # => 6 * 8 = 48 frames + # - 13 latent frames: (0, 1, 2) (special case), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12) are processed together + # => (1 frame slice) * ((3 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) + + # ((13 - 3) // 2) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) + # => 1 * 9 + 5 * 8 = 49 frames + # It has been implemented this way so as to not have "magic values" in the code base that would be hard to explain. Note that + # setting it to anything other than 2 would give poor results because the VAE hasn't been trained to be adaptive with different + # number of temporal frames. + self.num_latent_frames_batch_size = 2 + self.num_sample_frames_batch_size = 2 * int(math.prod([float(a) for a in self.args.temporal_compression_list])) + + # We make the minimum height and width of sample for tiling half that of the generally supported + self.tile_sample_min_height = args.sample_height // 2 + self.tile_sample_min_width = args.sample_width // 2 + self.tile_latent_min_height = int( + self.tile_sample_min_height / 8 + ) + self.tile_latent_min_width = int(self.tile_sample_min_width / 8) + + # These are experimental overlap factors that were chosen based on experimentation and seem to work best for + # 720x480 (WxH) resolution. The above resolution is the strongly recommended generation resolution in CogVideoX + # and so the tiling implementation has only been tested on those specific resolutions. 
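# With the defaults constructed in video_vae_model above (temporal_compression_list=[2, 2],
# spatial_compression_list=[2, 2, 2]) the tokenizer compresses time by 4x and space by 8x,
# so num_sample_frames_batch_size above works out to 2 * 4 = 8 input frames per batch of
# 2 latent frames; the 129 // 4 and 2048 // 8 caps used for the RoPE tables earlier in this
# diff divide by the same factors.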
+ self.tile_overlap_factor_height = 1 / 6 + self.tile_overlap_factor_width = 1 / 5 + + if cp.is_cp_initialized(): + self.cp_size = cp.get_cp_size() + self.cp_rank = cp.get_cp_rank() + + self.lfq_weight = args.lfq_weight + self.commitment_loss_weight = args.commitment_loss_weight + self.compute_all_commitment = args.compute_all_commitment # compute commitment between input and rq-output + if args.quantizer_type == 'MultiScaleBSQ': + quantizer_class = MultiScaleBSQ + elif args.quantizer_type == 'MultiScaleBSQTP': + quantizer_class = MultiScaleBSQTP_AP + elif args.quantizer_type == 'MultiScaleFSQ': + quantizer_class = MultiScaleFSQ + elif args.quantizer_type == 'MultiScaleFSQTP': + quantizer_class = MultiScaleFSQTP + elif args.quantizer_type == 'MultiScaleFSQSIM': + quantizer_class = MultiScaleFSQSIM + else: + raise NotImplementedError + + ratio2hws_video_common_v2, total_pixels2scales = get_ratio2hws_video_v2() + scales_256 = total_pixels2scales['0.06M'] + h_div_w2hw = {} + for h_div_w in ratio2hws_video_common_v2: + h_div_w2hw[h_div_w] = ratio2hws_video_common_v2[h_div_w][scales_256-1] + h_div_w2hw[1/h_div_w] = (h_div_w2hw[h_div_w][1], h_div_w2hw[h_div_w][0]) + self.h_div_w2hw = h_div_w2hw + self.h_div_w_templates = np.array(list(self.h_div_w2hw.keys())) + self.scales_256 = scales_256 + args.h_div_w2hw = h_div_w2hw + args.h_div_w_templates = self.h_div_w_templates + args.scales_256 = scales_256 + dim = args.codebook_dim if args.codebook_dim_low < 0 else args.codebook_dim_low * 4 + self.quantizer = quantizer_class( + dim = args.codebook_dim_low * 4, # this is the input feature dimension, defaults to log2(codebook_size) if not defined + entropy_loss_weight = args.entropy_loss_weight, # how much weight to place on entropy loss + commitment_loss_weight=args.commitment_loss_weight, # loss weight of commitment loss + use_stochastic_depth=args.use_stochastic_depth, + drop_rate=args.drop_rate, + schedule_mode=args.schedule_mode, + keep_first_quant=args.keep_first_quant, + keep_last_quant=args.keep_last_quant, + remove_residual_detach=args.remove_residual_detach, + use_out_phi=args.use_out_phi, + use_out_phi_res=args.use_out_phi_res, + random_flip = args.random_flip, + flip_prob = args.flip_prob, + flip_mode = args.flip_mode, + max_flip_lvl = args.max_flip_lvl, + random_flip_1lvl = args.random_flip_1lvl, + flip_lvl_idx = args.flip_lvl_idx, + drop_when_test = args.drop_when_test, + drop_lvl_idx = args.drop_lvl_idx, + drop_lvl_num = args.drop_lvl_num, + random_short_schedule = args.random_short_schedule, + short_schedule_prob = args.short_schedule_prob, + use_bernoulli = args.use_bernoulli, + use_rot_trick = args.use_rot_trick, + disable_flip_prob = args.disable_flip_prob, + casual_multi_scale = args.casual_multi_scale, + temporal_slicing = args.temporal_slicing, + last_scale_repeat_n = args.last_scale_repeat_n, + num_lvl_fsq = args.num_lvl_fsq, + other_args=args, + ) + self.quantize = self.quantizer + self.codebook_dim_continuous = args.codebook_dim + assert args.codebook_dim_low > 0 + self.codebook_dim = args.codebook_dim_low * 4 + self.vocab_size = 2**self.codebook_dim + + if args.freeze_encoder: + for param in self.encoder.parameters(): + param.requires_grad = False + if args.freeze_decoder: + for param in self.decoder.parameters(): + param.requires_grad = False + + self.origin_dim = 64 + assert args.use_feat_proj in [0, 1, 2], f'use_feat_proj must be 0, 1, 2' + if args.use_feat_proj > 0: + if args.use_feat_proj == 1: + self.proj_down = nn.Linear(self.origin_dim*2, self.origin_dim*2) + 
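+ # a separate projection is kept for the optional second (high-resolution) scale;
+ # forward() applies proj_down to the base scale and proj_down_two to the high-res one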
self.proj_down_two = nn.Linear(self.origin_dim*2, self.origin_dim*2) + elif args.use_feat_proj == 2: + self.proj_down = nn.Linear(self.origin_dim, self.origin_dim) + self.proj_down_two = nn.Linear(self.origin_dim, self.origin_dim) + self.proj_up = nn.Linear(self.origin_dim, self.origin_dim) + self.proj_up_two = nn.Linear(self.origin_dim, self.origin_dim) + else: + self.proj_down, self.proj_up, self.proj_down_two, self.proj_up_two = nn.Identity(), nn.Identity(), nn.Identity(), nn.Identity() + self.other_args = args + self.scale_learnable_parameters = nn.Parameter(torch.ones(4)) + + def _set_gradient_checkpointing(self, module, value=False, subset=True): + if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)): + module.gradient_checkpointing = value + + for n, m in module.named_modules(): + if hasattr(m, 'gradient_checkpointing') and subset: + m.gradient_checkpointing = value + + def _set_no_grad(self, module): + for param in module.parameters(): + param.requires_grad = False + + def enable_tiling( + self, + tile_sample_min_height: Optional[int] = None, + tile_sample_min_width: Optional[int] = None, + tile_overlap_factor_height: Optional[float] = None, + tile_overlap_factor_width: Optional[float] = None, + ) -> None: + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int( + self.tile_sample_min_height / 8 + ) + self.tile_latent_min_width = int(self.tile_sample_min_width / 8) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_tiling = False + + def enable_slicing(self) -> None: + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self) -> None: + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + def _encode(self, x: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, num_frames, height, width = x.shape + self.raw_height = height + self.raw_width = width + + if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height): + return self.tiled_encode(x) + + frame_batch_size = self.num_sample_frames_batch_size + # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k. + # As the extra single frame is handled inside the loop, it is not required to round up here. 
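+ # Worked example of the slicing below (single-rank path, default
+ # num_sample_frames_batch_size = 2 * 2 * 2 = 8):
+ #   num_frames=17 -> num_batches=2, remaining_frames=1
+ #   i=0 encodes frames [0:9] (the leftover frame rides along with the first slice)
+ #   i=1 encodes frames [9:17]
+ # When context parallelism is active, the frames are instead split evenly across
+ # cp ranks and the per-rank results are gathered afterwards.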
+ num_batches = max(num_frames // frame_batch_size, 1) + if num_batches > 1: + if cp.is_cp_initialized(): + frame_batch_size = num_frames // self.cp_size + num_batches = self.cp_size + cp.set_cp_on(True) + else: + cp.set_cp_on(False) + + + conv_cache = None + enc = [] + + for i in range(num_batches): + if cp.cp_on() and i != self.cp_rank: + continue + + remaining_frames = num_frames % frame_batch_size + start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames) + end_frame = frame_batch_size * (i + 1) + remaining_frames + x_intermediate = x[:, :, start_frame:end_frame] + + + torch._dynamo.mark_dynamic(x_intermediate, 0) + torch._dynamo.mark_dynamic(x_intermediate, 2) + if conv_cache is not None: + for key, tensor in conv_cache.items(): + if tensor is not None and isinstance(tensor, torch.Tensor): + torch._dynamo.mark_dynamic(tensor, 0) + + x_intermediate, conv_cache = self.encoder(x_intermediate, conv_cache=conv_cache) + + if self.quant_conv is not None: + x_intermediate = self.quant_conv(x_intermediate) + + enc.append(x_intermediate) + + if cp.cp_on(): + enc = dist_encoder_gather_result(enc[0]) + + enc = torch.cat(enc, dim=2) + + return enc + + def encode_for_raw_features( + self, x: torch.Tensor, + scale_schedule, + return_residual_norm_per_scale=False, + slice=None, + ): + is_image = x.ndim == 4 + if not is_image: + B, C, T, H, W = x.shape + else: + B, C, H, W = x.shape + T = 1 + x = x.unsqueeze(2) + + with torch.amp.autocast("cuda", dtype=self.encoder_dtype): + h = self.encode(x) + # adjust latent dim + h = patchify(h) # (B,c,t,H,W) -> (B,4c,t,H/2,W/2) + + posterior = DiagonalGaussianDistribution(h) + z = posterior.sample() + z = self.dropout_z_layer(z) + if self.other_args.use_feat_proj == 2: + z = self.proj_down(z.permute(0,2,3,4,1)).permute(0,4,1,2,3) # (B,24,t,H/2,W/2) + z = z * self.scale_learnable_parameters[0] + return z, None, None + + + def encode( + self, x: torch.Tensor, return_dict: bool = True + ): + h = None + if self.use_slicing and x.shape[0] > 1: + encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] + h = torch.cat(encoded_slices) + else: + h = self._encode(x) + + if not return_dict: + return (h,) + return h + + def _decode(self, z: torch.Tensor, return_dict: bool = True): + batch_size, num_channels, num_frames, height, width = z.shape + + if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height): + return self.tiled_decode(z, return_dict=return_dict) + + frame_batch_size = self.num_latent_frames_batch_size + + num_batches = max(num_frames // frame_batch_size, 1) + split_first = False + if num_frames % frame_batch_size == 0 and num_batches: + split_first = True + num_batches -= 1 + if num_batches > 1: + if cp.is_cp_initialized(): + frame_batch_size = num_frames // self.cp_size + num_batches = self.cp_size + cp.set_cp_on(True) + else: + cp.set_cp_on(False) + + conv_cache = None + dec = [] + + start_frame = 0 + remaining_frames = num_frames % frame_batch_size + if split_first: + remaining_frames += frame_batch_size + for i in range(num_batches): + if cp.cp_on() and i != self.cp_rank: + continue + + end_frame = frame_batch_size * (i + 1) + remaining_frames + z_intermediate = z[:, :, start_frame:end_frame] + start_frame = end_frame + if self.post_quant_conv is not None: + z_intermediate = self.post_quant_conv(z_intermediate) + + + torch._dynamo.mark_dynamic(z_intermediate, 0) + torch._dynamo.mark_dynamic(z_intermediate, 2) + torch._dynamo.mark_dynamic(z_intermediate, 3) + 
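+ # width (dim 4) is marked dynamic as well, so the compiled decoder graph is not
+ # specialized to a single resolution: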
torch._dynamo.mark_dynamic(z_intermediate, 4) + if conv_cache is not None: + for key, tensor in conv_cache.items(): + if tensor is not None and isinstance(tensor, torch.Tensor): + torch._dynamo.mark_dynamic(tensor, 0) + + z_intermediate, conv_cache = self.decoder(z_intermediate, conv_cache=conv_cache, split_first=split_first) + split_first = False + + dec.append(z_intermediate) + + if cp.cp_on(): + dec = dist_decoder_gather_result(dec[0]) + + dec = torch.cat(dec, dim=2) + + if not return_dict: + return (dec,) + + return dec + + def decode(self, z: torch.Tensor, return_dict: bool = True, **kwargs): + + z = z / self.scale_learnable_parameters[0] + z = self.proj_up(z.permute(0,2,3,4,1)).permute(0,4,1,2,3) + + z = unpatchify(z) + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [self._decode(z_slice) for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = self._decode(z) + + if not return_dict: + return (decoded,) + return decoded + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.Tensor`): Input batch of videos. + + Returns: + `torch.Tensor`: + The latent representation of the encoded videos. + """ + # For a rough memory estimate, take a look at the `tiled_decode` method. + batch_size, num_channels, num_frames, height, width = x.shape + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + frame_batch_size = self.num_sample_frames_batch_size + + # Split x into overlapping tiles and encode them separately. + # The tiles have an overlap to avoid seams between tiles. + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k. + # As the extra single frame is handled inside the loop, it is not required to round up here. 
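+ # Same temporal slicing as in _encode: the leftover num_frames % frame_batch_size
+ # frames are folded into the first slice, and later slices reuse the conv cache.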
+ num_batches = max(num_frames // frame_batch_size, 1) + conv_cache = None + time = [] + + for k in range(num_batches): + remaining_frames = num_frames % frame_batch_size + start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames) + end_frame = frame_batch_size * (k + 1) + remaining_frames + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile, conv_cache = self.encoder(tile, conv_cache=conv_cache) + if self.quant_conv is not None: + tile = self.quant_conv(tile) + time.append(tile) + + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + + enc = torch.cat(result_rows, dim=3) + return enc + + def tiled_decode(self, z: torch.Tensor, return_dict: bool = True): + # Rough memory assessment: + # - In CogVideoX-2B, there are a total of 24 CausalConv3d layers. + # - The biggest intermediate dimensions are: [1, 128, 9, 480, 720]. + # - Assume fp16 (2 bytes per value). + # Memory required: 1 * 128 * 9 * 480 * 720 * 24 * 2 / 1024**3 = 17.8 GB + # + # Memory assessment when using tiling: + # - Assume everything as above but now HxW is 240x360 by tiling in half + # Memory required: 1 * 128 * 9 * 240 * 360 * 24 * 2 / 1024**3 = 4.5 GB + + batch_size, num_channels, num_frames, height, width = z.shape + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + frame_batch_size = self.num_latent_frames_batch_size + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. 
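+ # With the defaults (30x45 latent tiles, overlap factors 1/6 and 1/5) this gives
+ # strides of 25 latent rows and 36 latent cols, so a 480x720 video (60x90 latent)
+ # is covered by a 3x3 grid of tiles. Each decoded tile (up to 240x360 samples) is
+ # cross-faded with its upper/left neighbours over 40 rows / 72 cols by
+ # blend_v / blend_h, i.e. out[y] = a[-extent+y] * (1 - y/extent) + b[y] * (y/extent),
+ # and then contributes its first 200x288 rows/cols to the stitched output.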
+ rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + num_batches = max(num_frames // frame_batch_size, 1) + conv_cache = None + time = [] + + for k in range(num_batches): + remaining_frames = num_frames % frame_batch_size + start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames) + end_frame = frame_batch_size * (k + 1) + remaining_frames + tile = z[ + :, + :, + start_frame:end_frame, + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + if self.post_quant_conv is not None: + tile = self.post_quant_conv(tile) + tile, conv_cache = self.decoder(tile, conv_cache=conv_cache) + time.append(tile) + + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + + if not return_dict: + return (dec,) + + return dec + + ### original cogvideox forward + # def forward( + # self, + # sample: torch.Tensor, + # sample_posterior: bool = False, + # return_dict: bool = True, + # generator: Optional[torch.Generator] = None, + # ) -> Union[torch.Tensor, torch.Tensor]: + # x = sample + # posterior = self.encode(x).latent_dist + # if sample_posterior: + # z = posterior.sample(generator=generator) + # else: + # z = posterior.mode() + # dec = self.decode(z) + # if not return_dict: + # return (dec,) + # return dec + + def forward(self, x, disc_factor, image_disc=None, video_disc=None, image_perceptual_model=None, video_perceptual_model=None, is_train=True): + device = x.device + is_image = x.ndim == 4 + if not is_image: + B, C, T, H, W = x.shape + else: + B, C, H, W = x.shape + T = 1 + x = x.unsqueeze(2) + + semantic_enlarge_factor = torch.clamp(self.scale_learnable_parameters, min=0.01)[0] # for low resolution + detail_enlarge_factor = torch.clamp(self.scale_learnable_parameters, min=0.01)[1] # for high resolution + + h_div_w = H / W + h_div_w_template = self.h_div_w_templates[np.argmin(np.abs(self.h_div_w_templates - h_div_w))] + hh, ww = self.h_div_w2hw[h_div_w_template] + is_high_resolution = H*W > hh*ww*256 + x_list = [] + if self.other_args.use_multi_scale and is_high_resolution: + x_list.append(F.interpolate(x, size=(T, hh*16, ww*16), mode=self.quantizer.z_interplote_down)) + x_list.append(x) + assert len(x_list) <= 2 + z_list = [] + for i, x in enumerate(x_list): + with torch.amp.autocast("cuda", dtype=self.encoder_dtype): + h = self.encode(x) + # adjust latent dim + h = patchify(h) # (B,c,t,H,W) -> (B,4c,t,H/2,W/2) + + if self.other_args.use_feat_proj == 1: + if i==0: + h = self.proj_down(h.permute(0,2,3,4,1)).permute(0,4,1,2,3) # (B,24,t,H/2,W/2) + elif i==1: + h = self.proj_down_two(h.permute(0,2,3,4,1)).permute(0,4,1,2,3) # (B,24,t,H/2,W/2) + + posterior = DiagonalGaussianDistribution(h) + z = posterior.sample() + z = self.dropout_z_layer(z) + + if self.other_args.use_feat_proj == 2: + if i==0: + z = self.proj_down(z.permute(0,2,3,4,1)).permute(0,4,1,2,3) # (B,24,t,H/2,W/2) + elif i==1: + z = self.proj_down_two(z.permute(0,2,3,4,1)).permute(0,4,1,2,3) # (B,24,t,H/2,W/2) + + if i == 0: + 
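+ # base-scale latent: scaled by the learnable "semantic" factor; the optional
+ # high-resolution latent below is scaled by the "detail" factor instead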
z_list.append(z.clone() * semantic_enlarge_factor) + elif i==1: + z_list.append(z.clone() * detail_enlarge_factor) + + # quantize + # z_list_bk = z_list + z_list, all_indices, all_loss = self.quantizer(z_list) # (B,24,t,H/2,W/2) + # z_list = z_list_bk + + x_recon_list = [] + for i in range(len(z_list)): + if i==0: + z_list[i] = z_list[i] / semantic_enlarge_factor + z_list[i] = self.proj_up(z_list[i].permute(0,2,3,4,1)).permute(0,4,1,2,3) # (B,64,t,H/2,W/2) + elif i==1: + z_list[i] = z_list[i] / detail_enlarge_factor + z_list[i] = self.proj_up_two(z_list[i].permute(0,2,3,4,1)).permute(0,4,1,2,3) # (B,64,t,H/2,W/2) + + z_list[i] = unpatchify(z_list[i]) # (B,4c,t,H/2,W/2) -> (B,c,t,H,W) + + with torch.amp.autocast("cuda", dtype=self.decoder_dtype): + x_recon = self.decode(z_list[i]).to(torch.float32) + x_recon_list.append(x_recon) + + loss_dict, log_dict = {}, {} + log_dict['semantic_enlarge_factor'] = torch.tensor(self.scale_learnable_parameters[0].item(), device=device) + log_dict['detail_enlarge_factor'] = torch.tensor(self.scale_learnable_parameters[1].item(), device=device) + + if "FSQ" in self.args.quantizer_type: + vq_output = {"encodings": all_indices} + else: + vq_output = { + "commitment_loss": torch.mean(all_loss) * self.lfq_weight, # here commitment loss is sum of commitment loss and entropy penalty + "encodings": all_indices, + } + + # assert x.shape == x_recon.shape, f"x.shape {x.shape}, x_recon.shape {x_recon.shape}" + if is_train == False: + if self.other_args.return_256_res: + return x_list[0], x_recon_list[0] + else: + return x_list[-1], x_recon_list[-1] + + # if is_high_resolution_video: + # x_recon_list, x_list = x_recon_list[1:], x_list[1:] + if "FSQ" not in self.args.quantizer_type: + loss_dict["train/commitment_loss"] = vq_output['commitment_loss'] + # loss_dict["train/all_commitment_loss"] = vq_output['all_commitment_loss'] + for (x_recon, x) in zip(x_recon_list, x_list): + if self.args.recon_loss_type == 'l1': + recon_loss = F.l1_loss(x_recon, x) * self.args.l1_weight + else: + recon_loss = F.mse_loss(x_recon, x) * self.args.l1_weight + if 'train/recon_loss' not in loss_dict: + loss_dict['train/recon_loss'] = recon_loss + else: + loss_dict['train/recon_loss'] += recon_loss + + if is_image: # handle the cases with 4 dims + flat_frames = x = x.squeeze(2) + flat_frames_recon = x_recon = x_recon.squeeze(2) + else: + flat_frames = rearrange(x, "B C T H W -> (B T) C H W") + flat_frames_recon = rearrange(x_recon, "B C T H W -> (B T) C H W") + + # Perceptual loss + if is_image: + image_perceptual_loss = image_perceptual_model(flat_frames, flat_frames_recon).mean() * self.args.perceptual_weight + if "train/image_perceptual_loss" not in loss_dict: + loss_dict["train/image_perceptual_loss"] = image_perceptual_loss + else: + loss_dict["train/image_perceptual_loss"] += image_perceptual_loss + else: + if self.args.lpips_model == "swin3d_t": + video_perceptual_loss = video_perceptual_model(x, x_recon).mean() * self.args.video_perceptual_weight + else: + video_perceptual_loss = video_perceptual_model(flat_frames, flat_frames_recon).mean() * self.args.video_perceptual_weight + if "train/video_perceptual_loss" not in loss_dict: + loss_dict["train/video_perceptual_loss"] = video_perceptual_loss + else: + loss_dict["train/video_perceptual_loss"] += video_perceptual_loss + + ### GAN loss + if self.args.image_gan_weight > 0 and (self.args.gan_image4video == "yes" or is_image): + logits_image_fake = image_disc(flat_frames_recon) + g_image_loss = -torch.mean(logits_image_fake) * 
self.args.image_gan_weight * disc_factor + if 'train/g_image_loss' not in loss_dict: + loss_dict["train/g_image_loss"] = g_image_loss + else: + loss_dict["train/g_image_loss"] += g_image_loss + if T > 1 and self.args.video_gan_weight > 0: + logits_video_fake = video_disc(x_recon) + g_video_loss = -torch.mean(logits_video_fake) * self.args.video_gan_weight * disc_factor + if 'train/g_video_loss' not in loss_dict: + loss_dict["train/g_video_loss"] = g_video_loss + else: + loss_dict["train/g_video_loss"] += g_video_loss + + loss_dict['train/recon_loss'] /= len(x_list) + if "train/image_perceptual_loss" in loss_dict: + loss_dict["train/image_perceptual_loss"] /= len(x_list) + if "train/video_perceptual_loss" in loss_dict: + loss_dict["train/video_perceptual_loss"] /= len(x_list) + + x_recon1, flat_frames1, flat_frames_recon1 = x_recon.detach(), flat_frames.detach(), flat_frames_recon.detach() + + return (x, x_recon1, flat_frames1, flat_frames_recon1, loss_dict, log_dict) + + + @staticmethod + def add_model_specific_args(parent_parser): + from infinity.models.videovae.utils import str2bool + + parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False) + parser.add_argument("--in_channels", type=int, default=3) + parser.add_argument("--out_channels", type=int, default=3) + parser.add_argument("--down_block_types", type=str, nargs='+', default=[ + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + ]) + parser.add_argument("--down_block_mode", type=str, default="cogvideox", choices=["cogvideox", "dc"]) + parser.add_argument("--up_block_types", type=str, nargs='+', default=[ + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + ]) + parser.add_argument("--up_block_mode", type=str, default="cogvideox", choices=["cogvideox", "dc"]) + parser.add_argument("--block_out_channels", type=int, nargs='+', default=[128, 128, 256, 256, 512, 512]) + parser.add_argument("--layers_per_block", type=int, default=3) + parser.add_argument("--latent_channels", type=int, default=16) + parser.add_argument("--act_fn", type=str, default="silu") + parser.add_argument("--norm_eps", type=float, default=1e-6) + parser.add_argument("--norm_num_groups", type=int, default=32) + # parser.add_argument("--temporal_compression_ratio", type=float, default=4) # deprecated + parser.add_argument("--spatial_compression_list", type=int, nargs='+', default=[2, 2, 2], choices=[2]) + parser.add_argument("--temporal_compression_list", type=int, nargs='+', default=[2, 2], choices=[2, 3]) + parser.add_argument("--sample_height", type=int, default=480) + parser.add_argument("--sample_width", type=int, default=720) + parser.add_argument("--use_quant_conv", action="store_true") + parser.add_argument("--use_post_quant_conv", action="store_true") + parser.add_argument("--down_layer", type=str, default="conv", choices=["conv", "dc", "3d-dc"]) + parser.add_argument('--down_norm', type=str2bool, default=False) + parser.add_argument("--up_layer", type=str, default="conv", choices=["conv", "dc", "3d-dc"]) + parser.add_argument('--up_norm', type=str2bool, default=False) + parser.add_argument("--pad_mode", type=str, default="constant", choices=["constant", "replicate"]) + parser.add_argument("--dropout_z", type=float, default=0.0) + return parser + +if __name__ == '__main__': + pass + diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__init__.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/__init__.py new file mode 
100644 index 0000000000000000000000000000000000000000..e94e60a59b607b839f3e2ee329265f041c170e1a --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from .lpips import LPIPS, ResNet50LPIPS +from .codebook import Codebook, MultiScaleCodebook +from .normalization import Normalize, SpatialGroupNorm +from .conv import FluxConv, DCDownBlock2d, DCUpBlock2d, DCDownBlock3d, DCUpBlock3d, CogVideoXCausalConv3d, CogVideoXSafeConv3d +from .commitments import DiagonalGaussianDistribution +from .loss import adopt_weight +from .misc import swish \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28e4de9922d30ca8297d33b5c6940915e29f0e5e Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/codebook.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/codebook.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6808356733412d518e1f8abcce4befd1862a83d2 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/codebook.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/commitments.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/commitments.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e7471a299aac6a9fabc57c4ef4a5a06d6127701 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/commitments.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/conv.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/conv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49ca1a0d71f8bf41500ef21864fc4fff8506ecec Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/conv.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/conv_wan.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/conv_wan.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1adceeaf008ad959ec58eb96e7df6acbb18dc2f Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/conv_wan.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/loss.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/loss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2800c48e543878352baa456af4eb733b9766f90d Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/loss.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/lpips.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/lpips.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..72d898edb0db5d4b5d39afcd0814bc528566b479 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/lpips.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/misc.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/misc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0719ae16634f0911513c163b032b28d823a6e513 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/misc.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/normalization.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/normalization.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b4bf4262eb8d8dba78b4a537466d9161d89adc2 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/normalization.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/normalization_wan.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/normalization_wan.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02d97c9d4ffd859f137eed16635bb4a18a69942f Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/__pycache__/normalization_wan.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/attention.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..a2e2e2a642fda91a5d90524540b580fb7e66c0b2 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/attention.py @@ -0,0 +1,702 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import math +from operator import truediv +import torch +import torch.nn.functional as F +from torch import nn, einsum +from beartype import beartype +from typing import Tuple + +from einops import rearrange, repeat +from einops.layers.torch import Rearrange +from timm.models.layers import to_2tuple, trunc_normal_ +from infinity.models.videovae.modules.drop_path import DropPath + +from fairscale.nn import checkpoint_wrapper +from torch.nn.attention import SDPBackend, sdpa_kernel +from infinity.models.videovae.utils.misc import is_dtype_16 +from infinity.models.videovae.modules.normalization import l2norm, LayerNorm, RMSNorm + + +def do_pool(x: torch.Tensor, stride: int) -> torch.Tensor: + # Refer to `Unroll` to see how this performs a maxpool-Nd + # B, N, C + return x.view(x.shape[0], stride, -1, x.shape[-1]).max(dim=1).values + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + + +def leaky_relu(p=0.1): + # return nn.LeakyReLU(p) + return nn.Identity() + + +def precompute_freqs_cis_2d(dim: int, end: int, H, W, theta: float = 10000.0, scale=1.0, use_cls=False): + # H = int( end**0.5 ) + assert H * W == end + flat_patch_pos = torch.arange(0 if not use_cls else -1, end) # N = end + x_pos = flat_patch_pos % H # N + y_pos = flat_patch_pos // H # N + freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) # Hc/4 + x_freqs = torch.outer(x_pos, freqs).float() # N Hc/4 + y_freqs = torch.outer(y_pos, freqs).float() # N Hc/4 + x_cis = 
torch.polar(torch.ones_like(x_freqs), x_freqs) + y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs) + freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1) # N,Hc/4,2 + freqs_cis = freqs_cis.reshape(end if not use_cls else end + 1, -1) + # we need to think how to implement this for multi heads. + # freqs_cis = torch.cat([x_cis, y_cis], dim=-1) # N, Hc/2 + return freqs_cis + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + # x: B N H Hc/2 + # freqs_cis: N, H*Hc/2 or N Hc/2 + ndim = x.ndim + assert 0 <= 1 < ndim + + if freqs_cis.shape[-1] == x.shape[-1]: + shape = [1 if i == 2 or i == 0 else d for i, d in enumerate(x.shape)] # 1, N, 1, Hc/2 + else: + shape = [d if i != 0 else 1 for i, d in enumerate(x.shape)] # 1, N, H, Hc/2 + # B, N, Hc/2 + return freqs_cis.view(*shape) + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + # xq : B N H Hc + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) # B N H Hc/2 + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) # B, N, H, Hc + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +class Pooling(nn.Module): + def __init__(self, pool_type, dim): + super().__init__() + if pool_type == "a": + self.pool = nn.AvgPool2d(kernel_size=2) + + elif pool_type == "m": + self.pool = nn.MaxPool2d(kernel_size=2) + + elif pool_type == "l": + self.pool = nn.Linear(4 * dim, dim) + + else: + raise NotImplementedError + + self.pool_type = pool_type + + def forward(self, x): + # B N C + B, N, C= x.shape + if self.pool_type in ["a", "m"]: + H, W = int(math.sqrt(N)), int(math.sqrt(N)) + x = x.view(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + x = self.pool(x) + x = x.view(B, C, -1).transpose(1, 2).contiguous() + + else: + x = x.view(B, N//4, -1) + x = self.pool(x) + + return x + + +class Up(nn.Module): + def __init__(self, up_type, dim): + super().__init__() + if up_type == "n": + self.up = nn.Upsample(scale_factor=2, mode='nearest') + + elif up_type == "r": + self.up = nn.Sequential( + nn.Upsample(scale_factor=2, mode='nearest'), + Rearrange('b c h w -> b (h w) c'), + nn.Linear(dim, dim) + ) + + else: + raise NotImplementedError + + self.up_type = up_type + + def forward(self, x): + # B N C + B, N, C= x.shape + if self.up_type == "n": + H, W = int(math.sqrt(N)), int(math.sqrt(N)) + x = x.view(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + x = self.up(x) + x = x.view(B, C, -1).transpose(1, 2).contiguous() + + else: + #x = self.up(x) # B, N, 4c + #x = x.view(B, N * 4, -1) + H, W = int(math.sqrt(N)), int(math.sqrt(N)) + x = x.view(B, H, W, -1).permute(0, 3, 1, 2).contiguous() # B, C, H, W + x = self.up(x) # B, (2H 2W), C + + return x + + +class GEGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.gelu(gate) * x + + +def FeedForward(dim, mult=4, dropout=0.): + """ Check this paper to understand the computation: https://arxiv.org/pdf/2002.05202.pdf""" + inner_dim = int(mult * (2 / 3) * dim) + return nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, inner_dim * 2, bias=False), + GEGLU(), + nn.Dropout(dropout), + nn.Linear(inner_dim, dim, bias=False) + ) + +# PEG - position generating module + + + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + 
Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + if isinstance(window_size, int): + window_size = (window_size, window_size) + + self.norm = LayerNorm(dim) + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + H, W = int(math.sqrt(N)), int(math.sqrt(N)) + x = self.norm(x) + + x = x.view(B_, H, W, -1) + # partition windows + x_windows = window_partition(x, self.window_size[0]) # nW*B, window_size, window_size, C + x_windows = 
x_windows.view(-1, self.window_size[0] * self.window_size[1], C) # nW*B, window_size*window_size, C + + BW, NW = x_windows.shape[:2] + + qkv = self.qkv(x_windows).reshape(BW, NW, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + attn = attn + relative_position_bias.unsqueeze(0) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x_windows = (attn @ v).transpose(1, 2).reshape(BW, NW, C) + x_windows = self.proj(x_windows) + x_windows = self.proj_drop(x_windows) + + x = window_reverse(x_windows, self.window_size[0], H, W) # B H' W' C + x = x.view(B_, H * W, C) + + return x + + + + +class PEG(nn.Module): + def __init__(self, dim, causal=False): + super().__init__() + self.causal = causal + self.dsconv = nn.Conv3d(dim, dim, 3, groups=dim) + + @beartype + def forward(self, x, shape: Tuple[int, int, int, int] = None): + needs_shape = x.ndim == 3 + assert not (needs_shape and not exists(shape)) + + orig_shape = x.shape + if needs_shape: + x = x.reshape(*shape, -1) + + x = rearrange(x, 'b ... d -> b d ...') + + frame_padding = (2, 0) if self.causal else (1, 1) + + x = F.pad(x, (1, 1, 1, 1, *frame_padding), value=0.) + x = self.dsconv(x) + + x = rearrange(x, 'b d ... -> b ... d') + + if needs_shape: + x = rearrange(x, 'b ... d -> b (...) d') + + return x.reshape(orig_shape) + +# attention + + +class Attention(nn.Module): + def __init__( + self, + dim, + dim_context=None, + dim_head=64, + heads=8, + causal=False, + norm_context=False, + dropout=0., + spatial_pos="rel", + mlp_block=False, + qk_norm=None + ): + super().__init__() + self.heads = heads + self.causal = causal + inner_dim = dim_head * heads + dim_context = default(dim_context, dim) + + # if spatial_pos == "rel": + # self.spatial_rel_pos_bias = ContinuousPositionBias(dim=dim, heads=heads) # HACK this: whether shared pos encoding is better or on the contrary + + self.spatial_pos = spatial_pos + self.freqs_cis = None + + # if causal: + # self.rel_pos_bias = AlibiPositionalBias(heads=heads) + + self.p_dropout = dropout + self.attn_dropout = nn.Dropout(dropout) + + self.norm = LayerNorm(dim) + self.context_norm = LayerNorm( + dim_context) if norm_context else nn.Identity() + + self.qk_norm = qk_norm + if qk_norm == "l2norm": + self.q_scale = nn.Parameter(torch.ones(dim_head)) + self.k_scale = nn.Parameter(torch.ones(dim_head)) + elif qk_norm == "rmsnorm": + self.q_norm = RMSNorm(dim_head) + self.k_norm = RMSNorm(dim_head) + + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_kv = nn.Linear(dim_context, inner_dim * 2, bias=False) + self.dim = inner_dim + + # mlp branch + self.mlp_block = mlp_block + if mlp_block: + self.mlp_in = nn.Linear(dim, inner_dim) + self.mlp_gelu = nn.GELU() + self.mlp_out = nn.Linear(dim, inner_dim) + + self.to_out = nn.Linear(inner_dim, dim) + + def forward( + self, + x, + mask=None, + context=None, + is_spatial=True, + q_stride=1, + rope_cache=None, + upcast_attention=None + ): + batch, device, dtype = x.shape[0], x.device, x.dtype + + if exists(context): + context = self.context_norm(context) + + kv_input = default(context, 
x) + + x = self.norm(x) + N = x.shape[1] + + q, k, v = self.to_q(x), *self.to_kv(kv_input).chunk(2, dim=-1) + q, k, v = map(lambda t: rearrange( + t, 'b n (h d) -> b n h d', h=self.heads), (q, k, v)) + + if self.spatial_pos == "rope" and is_spatial and rope_cache != None: + q, k = apply_rotary_emb(q, k, freqs_cis=rope_cache) + + q, k, v = map(lambda t: rearrange( + t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + + B, H, _, D = q.shape + if q_stride > 1: + # Refer to Unroll to see how this performs a maxpool-Nd + q = ( + q.view(B, H, q_stride, -1, D) + .max(dim=2) + .values + ) + + if self.qk_norm == "l2norm": + q, k = map(l2norm, (q, k)) + q = q * self.q_scale + k = k * self.k_scale + elif self.qk_norm == "rmsnorm": + q = self.q_norm(q) + k = self.k_norm(k) + + if exists(mask): + mask = rearrange(mask, 'b j -> b 1 1 j') + + if q.shape[-2] == 1 and k.shape[-2] == 1 and v.shape[-2] == 1: + dummy_op = torch.sum(q) * 0 + torch.sum(k) * 0 # # incorporate a dummy operation to ensure q and k are used + out = v + dummy_op + else: + # print(q.dtype, k.dtype, v.dtype) + q = q.to(torch.float32) if "q" in upcast_attention else q + k = k.to(torch.float32) if "k" in upcast_attention else k + v = v.to(torch.float32) if "v" in upcast_attention else v + if is_dtype_16(q) or is_dtype_16(k) or is_dtype_16(v): + with sdpa_kernel(SDPBackend.FLASH_ATTENTION): + out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=self.p_dropout, is_causal=self.causal) + else: + out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=self.p_dropout, is_causal=self.causal) + + out = rearrange(out, 'b h n d -> b n (h d)') + + # mlp_block branch + if self.mlp_block: + mlp_x = self.mlp_in(x) + mlp_x = self.mlp_gelu(mlp_x) + mlp_out = self.mlp_out(mlp_x) + out = out + mlp_out + + return self.to_out(out) + + +# alibi positional bias for extrapolation +class AlibiPositionalBias(nn.Module): + def __init__(self, heads): + super().__init__() + self.heads = heads + slopes = torch.Tensor(self._get_slopes(heads)) + slopes = rearrange(slopes, 'h -> h 1 1') + self.register_buffer('slopes', slopes, persistent=False) + self.register_buffer('bias', None, persistent=False) + + def get_bias(self, i, j, device): + i_arange = torch.arange(j - i, j, device=device) + j_arange = torch.arange(j, device=device) + bias = -torch.abs(rearrange(j_arange, 'j -> 1 1 j') - + rearrange(i_arange, 'i -> 1 i 1')) + return bias + + @staticmethod + def _get_slopes(heads): + def get_slopes_power_of_2(n): + start = (2**(-2**-(math.log2(n)-3))) + ratio = start + return [start*ratio**i for i in range(n)] + + if math.log2(heads).is_integer(): + return get_slopes_power_of_2(heads) + + closest_power_of_2 = 2 ** math.floor(math.log2(heads)) + return get_slopes_power_of_2(closest_power_of_2) + get_slopes_power_of_2(2 * closest_power_of_2)[0::2][:heads-closest_power_of_2] + + def forward(self, sim): + h, i, j, device = *sim.shape[-3:], sim.device + + if exists(self.bias) and self.bias.shape[-1] >= j: + return self.bias[..., :i, :j] + + bias = self.get_bias(i, j, device) + bias = bias * self.slopes + + num_heads_unalibied = h - bias.shape[0] + bias = F.pad(bias, (0, 0, 0, 0, 0, num_heads_unalibied)) + self.register_buffer('bias', bias, persistent=False) + + return self.bias + + +class ContinuousPositionBias(nn.Module): + """ from https://arxiv.org/abs/2111.09883 """ + + def __init__( + self, + *, + dim, + heads, + num_dims=2, # 2 for images, 3 for video + layers=2, + log_dist=True, + cache_rel_pos=False + ): + super().__init__() + 
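+ # A small MLP maps signed (optionally log-scaled) relative offsets between grid
+ # positions to one additive attention-bias value per head, following the
+ # continuous position bias of the paper cited above.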
self.num_dims = num_dims + self.log_dist = log_dist + + self.net = nn.ModuleList([]) + self.net.append(nn.Sequential( + nn.Linear(self.num_dims, dim), leaky_relu())) + + for _ in range(layers - 1): + self.net.append(nn.Sequential(nn.Linear(dim, dim), leaky_relu())) + + self.net.append(nn.Linear(dim, heads)) + + self.cache_rel_pos = cache_rel_pos + self.register_buffer('rel_pos', None, persistent=False) + + def forward(self, *dimensions, device=torch.device('cpu')): + + if not exists(self.rel_pos) or not self.cache_rel_pos: + positions = [torch.arange(d, device=device) for d in dimensions] + grid = torch.stack(torch.meshgrid(*positions, indexing='ij')) + grid = rearrange(grid, 'c ... -> (...) c') + rel_pos = rearrange(grid, 'i c -> i 1 c') - \ + rearrange(grid, 'j c -> 1 j c') + + if self.log_dist: + rel_pos = torch.sign(rel_pos) * torch.log(rel_pos.abs() + 1) + + self.register_buffer('rel_pos', rel_pos, persistent=False) + + rel_pos = self.rel_pos.float() + + for layer in self.net: + rel_pos = layer(rel_pos) + + return rearrange(rel_pos, 'i j h -> h i j') + +# transformer + + +class Transformer(nn.Module): + def __init__( + self, + dim, + *, + depth, + block, + dim_context=None, + causal=False, + dim_head=64, + heads=8, + ff_mult=4, + peg=False, + peg_causal=False, + has_cross_attn=False, + attn_dropout=0., + ff_dropout=0., + window_size=4, + spatial_pos="rel", + mlp_block=False, + upcast_attention=None, + qk_norm=None, + drop_path=0. + ): + super().__init__() + self.dim = dim + self.dim_head = dim_head + self.heads = heads + self.upcast_attention = upcast_attention + assert len(block) == depth + self.layers = nn.ModuleList([]) + dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] + for i in range(depth): + if block[i] == 't': + self.layers.append(nn.ModuleList([ + PEG(dim=dim, causal=peg_causal) if peg else None, + Attention(dim=dim, dim_head=dim_head, heads=heads, + causal=causal, dropout=attn_dropout, spatial_pos=spatial_pos, mlp_block=mlp_block, qk_norm=qk_norm), + Attention(dim=dim, dim_head=dim_head, dim_context=dim_context, heads=heads, causal=False, + dropout=attn_dropout, mlp_block=mlp_block, qk_norm=qk_norm) if has_cross_attn else None, + FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout), + DropPath(dpr[i]) if dpr[i] > 0. 
else nn.Identity() + ])) + + # elif block[i] == 'w': + # self.layers.append(nn.ModuleList([ + # None, + # WindowAttention(dim=dim, window_size=window_size, num_heads=heads, attn_drop=attn_dropout), + # None, + # FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) + # ])) + + # # various pooling methods: B, N, C + # elif block[i] in ['a', 'm', 'l']: + # self.layers.append(nn.ModuleList([ + # None, + # Pooling(block[i], dim), + # None, + # FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) + # ])) + + # elif block[i] in ['n', 'r']: + # self.layers.append(nn.ModuleList([ + # None, + # Up(block[i], dim), + # None, + # FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) + # ])) + + else: + raise NotImplementedError + + self.block = block + self.norm_out = nn.LayerNorm(dim) + + @beartype + def forward( + self, + x, + video_shape: Tuple[int, int, int, int] = None, + context=None, + self_attn_mask=None, + cross_attn_context_mask=None, + q_strides=None, + is_spatial=True + ): + if q_strides is None: + q_strides = '1' * len(self.layers) + + for blk, q_stride, (peg, self_attn, cross_attn, ff, drop_path) in zip(self.block, q_strides, self.layers): + if exists(peg): + with torch.amp.autocast("cuda", enabled=False): + x = peg(x, shape=video_shape) + x + + if isinstance(self_attn, Attention): + H, W = video_shape[2], video_shape[3] + if x.shape[-2] == H * W: + rope_cache = precompute_freqs_cis_2d(self.dim_head, x.shape[1], H, W).to(x.device) + elif x.shape[-2] == 1 or is_spatial == False: + rope_cache = None + else: + raise NotImplementedError + x = drop_path(self_attn( + x, mask=self_attn_mask, + q_stride=int(q_stride), is_spatial=is_spatial, + rope_cache=rope_cache, upcast_attention=self.upcast_attention + )) + do_pool(x, int(q_stride)) + + elif isinstance(self_attn, WindowAttention): + x = drop_path(self_attn(x)) + x + else: + x = self_attn(x) + + if exists(cross_attn) and exists(context): + x = cross_attn(x, context=context, + mask=cross_attn_context_mask) + x + + x = ff(x) + x + + # deal with downsampling: + if blk in ['a', 'm', 'l']: + video_shape = (video_shape[0], video_shape[1], video_shape[2]//2, video_shape[3]//2) # video_shape: B, T, H, W + + elif blk in ['n', 'r']: + video_shape = (video_shape[0], video_shape[1], int(video_shape[2]*2), int(video_shape[3]*2)) + + + if q_stride != '1': + down_ratio = int(math.sqrt(int(q_stride))) + video_shape = (video_shape[0], video_shape[1], video_shape[2]//down_ratio, video_shape[3]//down_ratio) + + return self.norm_out(x) diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/codebook.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/codebook.py new file mode 100644 index 0000000000000000000000000000000000000000..bd8f0aeb86643f4a210a1db9c5626e2fbf093b74 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/codebook.py @@ -0,0 +1,418 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +from enum import unique +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist + +from infinity.models.videovae.utils.misc import shift_dim + +class Codebook(nn.Module): + def __init__(self, n_codes, embedding_dim, no_random_restart=False, restart_thres=1.0, usage_sigma=0.99, fp32_quant=False): + super().__init__() + self.register_buffer('embeddings', torch.randn(n_codes, embedding_dim)) + self.register_buffer('N', torch.zeros(n_codes)) + self.register_buffer('z_avg', self.embeddings.data.clone()) + 
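+ # N and z_avg are EMA accumulators used in forward(): N tracks per-code assignment
+ # counts and z_avg the summed encoder features assigned to each code; the codebook
+ # is refreshed as z_avg divided by the Laplace-smoothed counts. codebook_usage
+ # below keeps an EMA of per-code usage fractions for logging.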
self.register_buffer('codebook_usage', torch.zeros(n_codes)) + + self.call_cnt = 0 + self.usage_sigma = usage_sigma + + self.n_codes = n_codes + self.embedding_dim = embedding_dim + self._need_init = True + self.no_random_restart = no_random_restart + self.restart_thres = restart_thres + + self.fp32_quant = fp32_quant + + def _tile(self, x): + d, ew = x.shape + if d < self.n_codes: + n_repeats = (self.n_codes + d - 1) // d + std = 0.01 / np.sqrt(ew) + x = x.repeat(n_repeats, 1) + x = x + torch.randn_like(x) * std + return x + + def _init_embeddings(self, z): + # z: [b, c, t, h, w] + self._need_init = False + flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2) + y = self._tile(flat_inputs) + + d = y.shape[0] + _k_rand = y[torch.randperm(y.shape[0])][:self.n_codes] + if dist.is_initialized(): + dist.broadcast(_k_rand, 0) + self.embeddings.data.copy_(_k_rand) + self.z_avg.data.copy_(_k_rand) + self.N.data.copy_(torch.ones(self.n_codes)) + + + def calculate_batch_codebook_usage_percentage(self, batch_encoding_indices): + # Flatten the batch of encoding indices into a single 1D tensor + all_indices = batch_encoding_indices.flatten() + + # Obtain the total number of encoding indices in the batch to calculate percentages + total_indices = all_indices.numel() + + # Initialize a tensor to store the percentage usage of each code + codebook_usage_percentage = torch.zeros(self.n_codes, device=all_indices.device) + + # Count the number of occurrences of each index and get their frequency as percentages + unique_indices, counts = torch.unique(all_indices, return_counts=True) + # Calculate the percentage + percentages = (counts.float() / total_indices) + + # Populate the corresponding percentages in the codebook_usage_percentage tensor + codebook_usage_percentage[unique_indices.long()] = percentages + + return codebook_usage_percentage + + + + def forward(self, z): + # z: [b, c, t, h, w] + if self._need_init and self.training: + self._init_embeddings(z) + flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2) # [bthw, c] + + distances = (flat_inputs ** 2).sum(dim=1, keepdim=True) \ + - 2 * flat_inputs @ self.embeddings.t() \ + + (self.embeddings.t() ** 2).sum(dim=0, keepdim=True) # [bthw, c] + + encoding_indices = torch.argmin(distances, dim=1) + encode_onehot = F.one_hot(encoding_indices, self.n_codes).type_as(flat_inputs) # [bthw, ncode] + encoding_indices = encoding_indices.view(z.shape[0], *z.shape[2:]) # [b, t, h, w, ncode] + + embeddings = F.embedding(encoding_indices, self.embeddings) # [b, t, h, w, c] + embeddings = shift_dim(embeddings, -1, 1) # [b, c, t, h, w] + + commitment_loss = 0.25 * F.mse_loss(z, embeddings.detach()) + + # EMA codebook update + if self.training: + n_total = encode_onehot.sum(dim=0) + encode_sum = flat_inputs.t() @ encode_onehot + if dist.is_initialized(): + dist.all_reduce(n_total) + dist.all_reduce(encode_sum) + + self.N.data.mul_(0.99).add_(n_total, alpha=0.01) + self.z_avg.data.mul_(0.99).add_(encode_sum.t(), alpha=0.01) + + n = self.N.sum() + weights = (self.N + 1e-7) / (n + self.n_codes * 1e-7) * n + encode_normalized = self.z_avg / weights.unsqueeze(1) + self.embeddings.data.copy_(encode_normalized) + + y = self._tile(flat_inputs) + _k_rand = y[torch.randperm(y.shape[0])][:self.n_codes] + if dist.is_initialized(): + dist.broadcast(_k_rand, 0) + + if not self.no_random_restart: + usage = (self.N.view(self.n_codes, 1) >= self.restart_thres).float() + self.embeddings.data.mul_(usage).add_(_k_rand * (1 - usage)) + + embeddings_st = (embeddings - z).detach() + z + + 
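+ # embeddings_st is the straight-through estimate: the forward pass uses the
+ # quantized embeddings while gradients flow to z unchanged. The perplexity below
+ # is exp(entropy) of the average code distribution; it approaches n_codes when
+ # usage is uniform and 1 when a single code dominates.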
avg_probs = torch.mean(encode_onehot, dim=0) + perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10))) + + try: + usage = self.calculate_batch_codebook_usage_percentage(encoding_indices) + except: + usage = torch.zeros(self.n_codes, device=encoding_indices.device) + + + # print(usage.shape, torch.zeros(self.n_codes).shape) + + if self.call_cnt == 0: + self.codebook_usage.data = usage + else: + self.codebook_usage.data = self.usage_sigma * self.codebook_usage.data + (1 - self.usage_sigma) * usage + + self.call_cnt += 1 + # avg_distribution = self.codebook_usage.data.sum() / self.n_codes + avg_usage = (self.codebook_usage.data > (1/self.n_codes)).sum() / self.n_codes + + return dict(embeddings=embeddings_st, encodings=encoding_indices, + commitment_loss=commitment_loss, perplexity=perplexity, avg_usage=avg_usage, batch_usage=usage) + + def dictionary_lookup(self, encodings): + embeddings = F.embedding(encodings, self.embeddings) + return embeddings + + +# Multi-scale Codebook +from typing import List, Optional, Tuple, Sequence, Union + + +class ResConvAfterUpsample(nn.Conv3d): + def __init__(self, embed_dim, quant_resi): + ks = 3 if quant_resi < 0 else 1 + super().__init__(in_channels=embed_dim, out_channels=embed_dim, kernel_size=ks, stride=1, padding=ks//2) + self.resi_ratio = abs(quant_resi) + + def forward(self, h_BCthw): + return h_BCthw.mul(1-self.resi_ratio) + super().forward(h_BCthw).mul_(self.resi_ratio) + + +class SharedResConvAfterUpsample(nn.Module): + def __init__(self, qresi: ResConvAfterUpsample): + super().__init__() + self.qresi: ResConvAfterUpsample = qresi + + def __getitem__(self, _) -> ResConvAfterUpsample: + return self.qresi + + +class ResConvAfterUpsampleList(nn.Module): + def __init__(self, qresi_ls: nn.ModuleList): + super().__init__() + self.qresi_ls = qresi_ls + K = len(qresi_ls) + self.ticks = np.linspace(1/3/K, 1-1/3/K, K) if K == 4 else np.linspace(1/2/K, 1-1/2/K, K) + + def __getitem__(self, at_from_0_to_1: float) -> ResConvAfterUpsample: + return self.qresi_ls[np.argmin(np.abs(self.ticks - at_from_0_to_1)).item()] + + def extra_repr(self) -> str: + return f'ticks={self.ticks}' + + +class ResConvAfterUpsampleModuleList(nn.ModuleList): + def __init__(self, qresi: List): + super().__init__(qresi) + # self.qresi = qresi + K = len(qresi) + self.ticks = np.linspace(1/3/K, 1-1/3/K, K) if K == 4 else np.linspace(1/2/K, 1-1/2/K, K) + + def __getitem__(self, at_from_0_to_1: float) -> ResConvAfterUpsample: + return super().__getitem__(np.argmin(np.abs(self.ticks - at_from_0_to_1)).item()) + + def extra_repr(self) -> str: + return f'ticks={self.ticks}' + +class MultiScaleCodebook(nn.Module): + def __init__(self, n_codes, + embedding_dim, no_random_restart=False, + restart_thres=1.0, usage_sigma=0.99, fp32_quant=False, + quant_resi = -0.5, share_quant_resi = 4, default_qresi_counts = 10, + t_patch_nums = (1, 1, 2, 2, 2, 4, 4, 4, 4, 4), + v_patch_nums = (1, 2, 3, 4, 5, 6, 8, 10, 13, 16), + ): + super().__init__() + self.register_buffer('embeddings', torch.randn(n_codes, embedding_dim)) + self.register_buffer('N', torch.zeros(n_codes)) + self.register_buffer('z_avg', self.embeddings.data.clone()) + self.register_buffer('codebook_usage', torch.zeros(n_codes)) + + self.call_cnt = 0 + self.usage_sigma = usage_sigma + + self.n_codes = n_codes + self.embedding_dim = embedding_dim + self._need_init = True + self.no_random_restart = no_random_restart + self.restart_thres = restart_thres + + self.fp32_quant = fp32_quant + + # quant resi + + self.t_patch_nums = 
t_patch_nums + self.v_patch_nums = v_patch_nums + self.quant_resi_ratio = quant_resi + + if share_quant_resi == 1: # args.qsr + self.quant_resi = SharedResConvAfterUpsample(ResConvAfterUpsample(embedding_dim, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity()) + elif share_quant_resi == 0: + self.quant_resi = ResConvAfterUpsampleModuleList([(ResConvAfterUpsample(embedding_dim, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity()) for _ in range(default_qresi_counts or len(self.v_patch_nums))]) + else: + self.quant_resi = ResConvAfterUpsampleList(nn.ModuleList([(ResConvAfterUpsample(embedding_dim, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity()) for _ in range(share_quant_resi)])) + + self.z_interplote_down = 'area' + self.z_interplote_up = 'trilinear' + + + + def _tile(self, x): + d, ew = x.shape + if d < self.n_codes: + n_repeats = (self.n_codes + d - 1) // d + std = 0.01 / np.sqrt(ew) + x = x.repeat(n_repeats, 1) + x = x + torch.randn_like(x) * std + return x + + def _init_embeddings(self, z): + # z: [b, c, t, h, w] + self._need_init = False + flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2) + y = self._tile(flat_inputs) + + d = y.shape[0] + _k_rand = y[torch.randperm(y.shape[0])][:self.n_codes] + if dist.is_initialized(): + dist.broadcast(_k_rand, 0) + self.embeddings.data.copy_(_k_rand) + self.z_avg.data.copy_(_k_rand) + self.N.data.copy_(torch.ones(self.n_codes)) + + + def calculate_batch_codebook_usage_percentage(self, batch_encoding_indices): + # Flatten the batch of encoding indices into a single 1D tensor + all_indices = batch_encoding_indices.flatten() + + # Obtain the total number of encoding indices in the batch to calculate percentages + total_indices = all_indices.numel() + + # Initialize a tensor to store the percentage usage of each code + codebook_usage_percentage = torch.zeros(self.n_codes, device=all_indices.device) + + # Count the number of occurrences of each index and get their frequency as percentages + unique_indices, counts = torch.unique(all_indices, return_counts=True) + # Calculate the percentage + percentages = (counts.float() / total_indices) + + # Populate the corresponding percentages in the codebook_usage_percentage tensor + codebook_usage_percentage[unique_indices.long()] = percentages + + return codebook_usage_percentage + + + + def forward(self, z): + # z: [b, c, t, h, w] + if self._need_init and self.training: + self._init_embeddings(z) + + # Always keep the T, H, W structure; flatten only for the nearest-neighbor lookup, then apply quant_resi + B, C, T, H, W = z.shape + + z_no_grad = z.detach() + accu_h = torch.zeros_like(z_no_grad) + + + if self.training: + all_flat_inputs, all_encode_onehot = [], [] + + commitment_loss = 0.0 + scale_num = len(self.v_patch_nums) + ms_encoding_indices = [] + + + with torch.cuda.amp.autocast(enabled=False): + + for si, (tpn, pn) in enumerate(zip(self.t_patch_nums, self.v_patch_nums)): + tpn = min(tpn, T) + + # latents + rest_z = z_no_grad - accu_h.data + + if si != scale_num - 1: # downsample z to this scale + rest_z = F.interpolate(rest_z, size=(tpn, pn, pn), mode=self.z_interplote_down) + + z_NC = rest_z.permute(0, 2, 3, 4, 1).reshape(-1, C) + + # distances between this scale's rest_z and the codebook + d_no_grad = torch.sum(z_NC.square(), dim=1, keepdim=True) + torch.sum(self.embeddings.square(), dim=1, keepdim=False) + d_no_grad.addmm_(z_NC, self.embeddings.t(), alpha=-2, beta=1) + + # convert to discrete ids + encoding_indices = torch.argmin(d_no_grad, dim=1) + encode_onehot = F.one_hot(encoding_indices, self.n_codes).type_as(z_NC) # [bthw, ncode] + encoding_indices = encoding_indices.view(rest_z.shape[0],
*rest_z.shape[2:]) # [b, t, h, w, ncode] + + ms_encoding_indices.append(encoding_indices) + + # convert the ids back to continuous features, denoted h_ + h_BTHWC = F.embedding(encoding_indices, self.embeddings) # [b, t, h, w, c] + h_BCTHW = h_BTHWC.permute(0, 4, 1, 2, 3).contiguous() # [b, c, t, h, w] + + # up & quant resi + + h_BCTHW = F.interpolate(h_BCTHW, size=(T, H, W), mode=self.z_interplote_up).contiguous() + + # apply a quant-resi convolution + quant_head = si / max(1, (scale_num - 1)) + h_BCTHW = self.quant_resi[quant_head](h_BCTHW) + + # accumulate h + accu_h = accu_h + h_BCTHW + + commitment_loss += 0.25 * F.mse_loss(accu_h, z.detach()) # 0.25 is the beta weight + + if self.training: + all_flat_inputs.append(z_NC) + all_encode_onehot.append(encode_onehot) + + if self.training: + + encode_onehot = torch.cat(all_encode_onehot, dim=0) + flat_inputs = torch.cat(all_flat_inputs, dim=0) + + n_total = encode_onehot.sum(dim=0) + encode_sum = flat_inputs.t() @ encode_onehot + if dist.is_initialized(): + dist.all_reduce(n_total) + dist.all_reduce(encode_sum) + + self.N.data.mul_(0.99).add_(n_total, alpha=0.01) + self.z_avg.data.mul_(0.99).add_(encode_sum.t(), alpha=0.01) + + n = self.N.sum() + weights = (self.N + 1e-7) / (n + self.n_codes * 1e-7) * n + encode_normalized = self.z_avg / weights.unsqueeze(1) + self.embeddings.data.copy_(encode_normalized) + + y = self._tile(flat_inputs) + _k_rand = y[torch.randperm(y.shape[0])][:self.n_codes] + if dist.is_initialized(): + dist.broadcast(_k_rand, 0) + + if not self.no_random_restart: + usage = (self.N.view(self.n_codes, 1) >= self.restart_thres).float() + self.embeddings.data.mul_(usage).add_(_k_rand * (1 - usage)) + + commitment_loss *= 1.0 / scale_num + embeddings_st = (accu_h - z_no_grad).detach() + z + + avg_probs = torch.mean(encode_onehot, dim=0) + perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10))) + + try: + usage = self.calculate_batch_codebook_usage_percentage(encoding_indices) + except: + usage = torch.zeros(self.n_codes, device=encoding_indices.device) + + + # print(usage.shape, torch.zeros(self.n_codes).shape) + + if self.call_cnt == 0: + self.codebook_usage.data = usage + else: + self.codebook_usage.data = self.usage_sigma * self.codebook_usage.data + (1 - self.usage_sigma) * usage + + self.call_cnt += 1 + # avg_distribution = self.codebook_usage.data.sum() / self.n_codes + avg_usage = (self.codebook_usage.data > (1/self.n_codes)).sum() / self.n_codes + + # print(f"training: {embeddings_st.size()=}, {encoding_indices.size()=}") + # for idx, en_idx in enumerate(ms_encoding_indices): + # print(f"{idx=}, {en_idx.size()=}", flush=True) + + return dict(embeddings=embeddings_st, encodings=ms_encoding_indices, + commitment_loss=commitment_loss, perplexity=perplexity, avg_usage=avg_usage, batch_usage=usage) + + def dictionary_lookup(self, encodings): + embeddings = F.embedding(encodings, self.embeddings) + return embeddings + diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/commitments.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/commitments.py new file mode 100644 index 0000000000000000000000000000000000000000..56b042c6c77291707c38198df68670a0c144b82a --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/commitments.py @@ -0,0 +1,183 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters +
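+ # `parameters` concatenates the predicted mean and log-variance along dim=1, so the
+ # chunk below recovers (mean, logvar); logvar is then clamped for numerical stability.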
self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) + + def sample(self): + # x = self.mean + self.std * torch.randn(self.mean.shape).to(device) + x = self.mean + self.std * torch.randn(self.mean.shape, device=self.parameters.device) + return x + + def kl(self, other=None, reduction="sum"): + if reduction == "sum": + reduction_op = torch.sum + elif reduction == "mean": + reduction_op = torch.mean + if self.mean.ndim == 4: + dims = [1,2,3] + else: + dims = [1,2,3,4] + if self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * reduction_op(torch.pow(self.mean, 2) + + self.var - 1.0 - self.logvar, + dim=dims) + else: + return 0.5 * reduction_op( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=dims) + + def nll(self, sample, dims=[1,2,3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). 
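+ # Closed form implemented below (element-wise, diagonal Gaussians), with v = exp(logvar):
+ #   KL(N(m1, v1) || N(m2, v2)) = 0.5 * (log(v2 / v1) + v1 / v2 + (m1 - m2)^2 / v2 - 1)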
+ logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + + logvar2 + - logvar1 + + torch.exp(logvar1 - logvar2) + + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) + ) + +class VectorQuantizer(nn.Module): + def __init__(self, n_e, e_dim, beta, entropy_loss_ratio, l2_norm, show_usage): + super().__init__() + self.n_e = n_e + self.e_dim = e_dim + self.beta = beta + self.entropy_loss_ratio = entropy_loss_ratio + self.l2_norm = l2_norm + self.show_usage = show_usage + + self.embedding = nn.Embedding(self.n_e, self.e_dim) + self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) + if self.l2_norm: + self.embedding.weight.data = F.normalize(self.embedding.weight.data, p=2, dim=-1) + if self.show_usage: + self.register_buffer("codebook_used", nn.Parameter(torch.zeros(65536))) + + + def forward(self, z): + # reshape z -> (batch, height, width, channel) and flatten + z = torch.einsum('b c h w -> b h w c', z).contiguous() + z_flattened = z.view(-1, self.e_dim) + # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z + + if self.l2_norm: + z = F.normalize(z, p=2, dim=-1) + z_flattened = F.normalize(z_flattened, p=2, dim=-1) + embedding = F.normalize(self.embedding.weight, p=2, dim=-1) + else: + embedding = self.embedding.weight + + d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \ + torch.sum(embedding**2, dim=1) - 2 * \ + torch.einsum('bd,dn->bn', z_flattened, torch.einsum('n d -> d n', embedding)) + + min_encoding_indices = torch.argmin(d, dim=1) + z_q = embedding[min_encoding_indices].view(z.shape) + perplexity = None + min_encodings = None + vq_loss = None + commit_loss = None + entropy_loss = None + codebook_usage = 0 + + if self.show_usage and self.training: + cur_len = min_encoding_indices.shape[0] + self.codebook_used[:-cur_len] = self.codebook_used[cur_len:].clone() + self.codebook_used[-cur_len:] = min_encoding_indices + codebook_usage = len(torch.unique(self.codebook_used)) / self.n_e + + # compute loss for embedding + if self.training: + vq_loss = torch.mean((z_q - z.detach()) ** 2) + commit_loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + entropy_loss = self.entropy_loss_ratio * compute_entropy_loss(-d) + + # preserve gradients + z_q = z + (z_q - z).detach() + + # reshape back to match original input shape + z_q = torch.einsum('b h w c -> b c h w', z_q) + + return z_q, (vq_loss, commit_loss, entropy_loss, codebook_usage), (perplexity, min_encodings, min_encoding_indices) + + def get_codebook_entry(self, indices, shape=None, channel_first=True): + # shape = (batch, channel, height, width) if channel_first else (batch, height, width, channel) + if self.l2_norm: + embedding = F.normalize(self.embedding.weight, p=2, dim=-1) + else: + embedding = self.embedding.weight + z_q = embedding[indices] # (b*h*w, c) + + if shape is not None: + if channel_first: + z_q = z_q.reshape(shape[0], shape[2], shape[3], shape[1]) + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + else: + z_q = z_q.view(shape) + return z_q + +def compute_entropy_loss(affinity, loss_type="softmax", temperature=0.01): + flat_affinity = affinity.reshape(-1, affinity.shape[-1]) + flat_affinity /= temperature + probs = F.softmax(flat_affinity, dim=-1) + log_probs = F.log_softmax(flat_affinity + 1e-5, dim=-1) + if loss_type == "softmax": + target_probs = probs + else: + raise ValueError("Entropy loss {} not supported".format(loss_type)) + avg_probs = 
torch.mean(target_probs, dim=0) + avg_entropy = - torch.sum(avg_probs * torch.log(avg_probs + 1e-5)) + sample_entropy = - torch.mean(torch.sum(target_probs * log_probs, dim=-1)) + loss = sample_entropy - avg_entropy + return loss \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/conv.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..645eb05ecdca5b63e7653746d972aa35cdc09931 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/conv.py @@ -0,0 +1,490 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from typing import Dict, Optional, Tuple, Union +import torch +import torch.nn as nn +from einops import rearrange, repeat +import torch.nn.functional as F +from .misc import swish +from infinity.models.videovae.modules.normalization import SpatialGroupNorm +from infinity.models.videovae.utils.context_parallel import ContextParallelUtils as cp +from infinity.models.videovae.utils.context_parallel import dist_conv_cache_send, dist_conv_cache_recv + + +class DCDownBlock3d(nn.Module): + def __init__(self, + in_channels: int, + out_channels: int, + shortcut: bool = True, + group_norm=False, + compress_time=False, + ) -> None: + super().__init__() + self.shortcut = shortcut + self.compress_time = compress_time + if group_norm: + self.norm = SpatialGroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True) + self.nonlinearity = swish + else: + self.norm = nn.Identity() + self.nonlinearity = nn.Identity() + self.spatial_factor = 2 + self.temporal_factor = int(compress_time) if compress_time else 1 + out_ratio = self.spatial_factor**2 + assert out_channels % out_ratio == 0 + out_channels = out_channels // out_ratio + + # self.conv = nn.Conv3d( + # in_channels, + # out_channels, + # kernel_size=3, + # stride=(1, 1, 1), + # padding=0, + # ) + self.conv = CogVideoXCausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode="first") + + def forward(self, hidden_states: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None, temporal_compress = True) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + x = hidden_states + x = self.nonlinearity(self.norm(x)) + assert x.ndim == 5, f"x.ndim must be (B C T H W)" + + ### use nn.Conv3d + # x = F.pad(x, (1, 1, 1, 1, 2, 0)) # causal pad (left, right, top, bottom, front, back) + # x[:, :, :2, 1:-1, 1:-1] = x[:, :, 2:3, 1:-1, 1:-1].clone() # broadcast the first value + # x = self.conv(x) + + ### use CogVideoXCausalConv3d + x, new_conv_cache["conv"] = self.conv(x, conv_cache=conv_cache.get("conv")) + + if x.shape[2] > 1: + if x.shape[2] % 2 == 1: + x_first, x_rest = x[:, :, 0, ...], x[:, :, 1:, ...] + y_first, y_rest = hidden_states[:, :, 0, ...], hidden_states[:, :, 1:, ...] 
+ else: + x_first, x_rest = None, x + y_first, y_rest = None, hidden_states + elif x.shape[2] == 1: + x_first, x_rest = x[:, :, 0, ...], None + y_first, y_rest = hidden_states[:, :, 0, ...], None + else: + raise NotImplementedError + if x_first is not None: + x_first = rearrange(x_first, "b c (h ph) (w pw) -> b (ph pw c) h w", ph=self.spatial_factor, pw=self.spatial_factor) + y_first = rearrange(y_first, "b c (h ph) (w pw) -> b (ph pw c) h w", ph=self.spatial_factor, pw=self.spatial_factor) + if x_rest is not None: + if temporal_compress: + x_rest = rearrange(x_rest, "b c (t pt) (h ph) (w pw) -> b (ph pw c) t pt h w", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + x_rest = x_rest.mean(dim=3) + y_rest = rearrange(y_rest, "b c (t pt) (h ph) (w pw) -> b (ph pw c) t pt h w", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + y_rest = y_rest.mean(dim=3) + else: + x_rest = rearrange(x_rest, "b c (t pt) (h ph) (w pw) -> b (ph pw c) (t pt) h w", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + y_rest = rearrange(y_rest, "b c (t pt) (h ph) (w pw) -> b (ph pw c) (t pt) h w", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + if x_first is not None and x_rest is not None: + x = torch.cat([x_first[:,:, None,...], x_rest], dim=2) + y = torch.cat([y_first[:,:, None,...], y_rest], dim=2) + else: + x = x_first[:,:, None,...] if x_first is not None else x_rest + y = y_first[:,:, None,...] if y_first is not None else y_rest + if self.shortcut: + y = rearrange(y, "b (g c) t h w -> b g c t h w", c=x.shape[1]).mean(dim=1) + hidden_states = x + y + else: + hidden_states = x + return hidden_states, new_conv_cache + +class DCUpBlock3d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + shortcut: bool = True, + interpolation_mode: str = "nearest", + group_norm=False, + compress_time=False + ) -> None: + super().__init__() + + self.compress_time = compress_time + if group_norm: + self.norm = SpatialGroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True) + self.nonlinearity = swish + else: + self.norm = nn.Identity() + self.nonlinearity = nn.Identity() + self.interpolation_mode = interpolation_mode + self.shortcut = shortcut + self.spatial_factor = 2 + self.temporal_factor = int(compress_time) if compress_time else 1 + out_channels = out_channels * self.spatial_factor**2 * self.temporal_factor + # self.conv = nn.Conv3d(in_channels, out_channels, 3, (1, 1, 1), 0) + self.conv = CogVideoXCausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode="first") + assert out_channels % in_channels == 0 + self.repeats = out_channels // in_channels + + def forward(self, hidden_states: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + x = hidden_states + x = self.nonlinearity(self.norm(x)) + + compress_first = False + if x.shape[2] % 2 == 1: + compress_first = True + + ### use nn.Conv3d + # x = F.pad(x, (1, 1, 1, 1, 2, 0)) # causal pad (left, right, top, bottom, front, back) + # x[:, :, :2, 1:-1, 1:-1] = x[:, :, 2:3, 1:-1, 1:-1].clone() # broadcast the first value + # x = self.conv(x) + + ### use CogVideoXCausalConv3d + x, new_conv_cache["conv"] = self.conv(x, conv_cache=conv_cache.get("conv")) + + x = rearrange(x, "b (pt ph pw c) t h w -> b c (t pt) (h ph) (w pw)", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + y = repeat(hidden_states, "b c t h w -> b (r c) t 
h w", r=self.repeats) + y = rearrange(y, "b (pt ph pw c) t h w -> b c (t pt) (h ph) (w pw)", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + + # convert pt+pt*n -> 1+pt*n + if self.temporal_factor > 1 and compress_first: + if x.shape[2] > 1: + x_first, x_rest = x[:, :, :self.temporal_factor, ...], x[:, :, self.temporal_factor:, ...] + y_first, y_rest = y[:, :, :self.temporal_factor, ...], y[:, :, self.temporal_factor:, ...] + elif x.shape[2] == 1: + assert x.shape[2] == y.shape[2] == self.temporal_factor + x_first, x_rest = x, None + y_first, y_rest = y, None + else: + raise NotImplementedError + x = torch.cat([x_first.mean(dim=2, keepdim=True), x_rest], dim=2) + y = torch.cat([y_first.mean(dim=2, keepdim=True), y_rest], dim=2) + if self.shortcut: + hidden_states = x + y + else: + hidden_states = x + return hidden_states, new_conv_cache + +class DCDownBlock2d(nn.Module): + def __init__(self, + in_channels: int, + out_channels: int, + downsample: bool = False, + shortcut: bool = True, + group_norm=False, + ) -> None: + super().__init__() + if group_norm: + self.norm = nn.GroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True) + self.nonlinearity = swish + else: + self.norm = nn.Identity() + self.nonlinearity = nn.Identity() + self.downsample = downsample + self.factor = 2 + self.stride = 1 if downsample else 2 + self.group_size = in_channels * self.factor**2 // out_channels + self.shortcut = shortcut + + out_ratio = self.factor**2 + if downsample: + assert out_channels % out_ratio == 0 + out_channels = out_channels // out_ratio + + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=self.stride, + padding=1, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = self.nonlinearity(self.norm(hidden_states)) + x = self.conv(x) + if self.downsample: + x = F.pixel_unshuffle(x, self.factor) + + if self.shortcut: + y = F.pixel_unshuffle(hidden_states, self.factor) + y = y.unflatten(1, (-1, self.group_size)) + y = y.mean(dim=2) + hidden_states = x + y + else: + hidden_states = x + + return hidden_states + +class DCUpBlock2d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + interpolate: bool = False, + shortcut: bool = True, + interpolation_mode: str = "nearest", + group_norm=False, + ) -> None: + super().__init__() + + if group_norm: + self.norm = nn.GroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True) + self.nonlinearity = swish + else: + self.norm = nn.Identity() + self.nonlinearity = nn.Identity() + self.interpolate = interpolate + self.interpolation_mode = interpolation_mode + self.shortcut = shortcut + self.factor = 2 + self.repeats = out_channels * self.factor**2 // in_channels + + out_ratio = self.factor**2 + + if not interpolate: + out_channels = out_channels * out_ratio + + self.conv = nn.Conv2d(in_channels, out_channels, 3, 1, 1) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = self.nonlinearity(self.norm(hidden_states)) + if self.interpolate: + x = F.interpolate(x, scale_factor=self.factor, mode=self.interpolation_mode) + x = self.conv(x) + else: + x = self.conv(x) + x = F.pixel_shuffle(x, self.factor) + + if self.shortcut: + y = hidden_states.repeat_interleave(self.repeats, dim=1) + y = F.pixel_shuffle(y, self.factor) + hidden_states = x + y + else: + hidden_states = x + + return hidden_states + +class CogVideoXSafeConv3d(nn.Conv3d): + r""" + A 3D convolution layer that splits the input tensor into smaller parts to 
avoid OOM in CogVideoX Model. + """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + memory_count = ( + (input.shape[1] * input.shape[2] * input.shape[3] * input.shape[4]) * 2 / 1024**3 + ) + + # Set to 2GB, suitable for CuDNN + if memory_count > 2: + kernel_size = self.kernel_size[0] + part_num = int(memory_count / 2) + 1 + input_chunks = torch.chunk(input, part_num, dim=2) + + if kernel_size > 1: + input_chunks = [input_chunks[0]] + [ + torch.cat((input_chunks[i - 1][:, :, -kernel_size + 1 :], input_chunks[i]), dim=2) + for i in range(1, len(input_chunks)) + ] + + output_chunks = [] + for input_chunk in input_chunks: + output_chunks.append(super().forward(input_chunk)) + output = torch.cat(output_chunks, dim=2) + return output + else: + return super().forward(input) + + +class CogVideoXCausalConv3d(nn.Module): + r"""A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model. + + Args: + in_channels (`int`): Number of channels in the input tensor. + out_channels (`int`): Number of output channels produced by the convolution. + kernel_size (`int` or `Tuple[int, int, int]`): Kernel size of the convolutional kernel. + stride (`int`, defaults to `1`): Stride of the convolution. + dilation (`int`, defaults to `1`): Dilation rate of the convolution. + pad_mode (`str`, defaults to `"constant"`): Padding mode. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int, int]], + stride: int = 1, + dilation: int = 1, + pad_mode: str = "constant", + ): + super().__init__() + + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) * 3 + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + self.pad_mode = pad_mode + time_pad = dilation * (time_kernel_size - 1) + (1 - stride) + height_pad = height_kernel_size // 2 + width_pad = width_kernel_size // 2 + + self.height_pad = height_pad + self.width_pad = width_pad + self.time_pad = time_pad + self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0) + + self.temporal_dim = 2 + self.time_kernel_size = time_kernel_size + + stride = (stride, 1, 1) + dilation = (dilation, 1, 1) + self.conv = CogVideoXSafeConv3d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + ) + + def fake_context_parallel_forward( + self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None + ) -> torch.Tensor: + kernel_size = self.time_kernel_size + + if cp.cp_on(): + conv_cache = dist_conv_cache_recv() + + if kernel_size > 1: + cached_inputs = [conv_cache.to(inputs.device)] if conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1) + inputs = torch.cat(cached_inputs + [inputs], dim=2) + return inputs + + def forward(self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None) -> torch.Tensor: + inputs = self.fake_context_parallel_forward(inputs, conv_cache) + + if cp.cp_on(): + dist_conv_cache_send(inputs[:, :, -self.time_kernel_size + 1 :]) + else: + conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() + + + padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad) + inputs = F.pad(inputs, padding_2d, mode="constant", value=0) + + output = self.conv(inputs) + return output, conv_cache + +class FluxConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, cnn_type="2d", cnn_slice_seq_len=17, causal_offset=0, temporal_down=False): + 
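+ # cnn_type="2d": a plain Conv2d applied frame by frame (5-D input is folded into the batch dim).
+ # cnn_type="3d": a Conv3d with causal temporal padding; long clips are processed in slices of
+ # cnn_slice_seq_len frames, and each slice is re-padded with the frames preceding it so the
+ # temporal receptive field stays causal across slice boundaries.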
super().__init__() + self.cnn_type = cnn_type + self.slice_seq_len = cnn_slice_seq_len + + if cnn_type == "2d": + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding) + if cnn_type == "3d": + if temporal_down == False: + stride = (1, stride, stride) + else: + stride = (stride, stride, stride) + self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=0) + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size, kernel_size) + self.padding = ( + kernel_size[0] - 1 + causal_offset, # Temporal causal padding + padding, # Height padding + padding # Width padding + ) + self.causal_offset = causal_offset + self.stride = stride + self.kernel_size = kernel_size + + def forward(self, x): + if self.cnn_type == "2d": + if type(x) == list: + for i in range(len(x)): + x[i] = self.forward(x[i]) + return x + if x.ndim == 5: + B, C, T, H, W = x.shape + x = rearrange(x, "B C T H W -> (B T) C H W") + x = self.conv(x) + x = rearrange(x, "(B T) C H W -> B C T H W", T=T) + return x + else: + return self.conv(x) + if self.cnn_type == "3d": + if x.ndim == 5: + assert self.stride[0] == 1 or self.stride[0] == 2, f"only temporal stride = 1 or 2 are supported" + if self.stride[0] == 1: + for i in reversed(range(0, x.shape[2], self.slice_seq_len+self.stride[0]-1)): + st = i + en = min(i+self.slice_seq_len, x.shape[2]) + _x = x[:,:,st:en,:,:] + if i == 0: + _x = F.pad(_x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1], # Height + self.padding[0], 0)) # Temporal + _x[:,:,:self.padding[0], + self.padding[1]:_x.shape[-2]-self.padding[1], + self.padding[2]:_x.shape[-1]-self.padding[2]] = x[:,:,0:1,:,:].clone() # broadcast the first value + else: + padding_0 = self.kernel_size[0] - 1 + _x = F.pad(_x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1], # Height + padding_0, 0)) # Temporal + _x[:,:,:padding_0, + self.padding[1]:_x.shape[-2]-self.padding[1], + self.padding[2]:_x.shape[-1]-self.padding[2]] = x[:,:,i-padding_0:i,:,:].clone() + try: + _x = self.conv(_x) + except: + xs = [_x[:,:,:,:,i-1:i+2] for i in range(1,_x.shape[-1]-1)] + for i in range(len(xs)): + xs[i] = self.conv(xs[i]) + _x = torch.cat(xs, dim=-1) + if i == 0: + x[:,:,st-self.causal_offset:en,:,:] = _x + x = x[:,:,1:,:,:] + else: + x[:,:,st:en,:,:] = _x + else: + xs = [] + for i in range(0, x.shape[2], self.slice_seq_len+self.stride[0]-1): + st = i + en = min(i+self.slice_seq_len, x.shape[2]) + _x = x[:,:,st:en,:,:] + if i == 0: + _x = F.pad(_x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1], # Height + self.padding[0], 0)) # Temporal + _x[:,:,:self.padding[0], + self.padding[1]:_x.shape[-2]-self.padding[1], + self.padding[2]:_x.shape[-1]-self.padding[2]] = x[:,:,0:1,:,:].clone() # broadcast the first value + else: + padding_0 = self.kernel_size[0] - 1 + _x = F.pad(_x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1], # Height + padding_0, 0)) # Temporal + _x[:,:,:padding_0, + self.padding[1]:_x.shape[-2]-self.padding[1], + self.padding[2]:_x.shape[-1]-self.padding[2]] = x[:,:,i-padding_0:i,:,:].clone() + _x = self.conv(_x) + xs.append(_x) + try: + x = torch.cat(xs, dim=2) + except: + device = x.device + del x + xs = [_x.cpu().pin_memory() for _x in xs] + torch.cuda.empty_cache() + x = torch.cat([_x for _x in xs], dim=2).to(device=device) + else: + x = F.pad(x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1])) # Height + 
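+ # 4-D (single image) input: the lines below collapse the temporal kernel by summing it over
+ # the time dimension and run an ordinary 2-D convolution with the same spatial stride, so
+ # image and video inputs share one set of weights.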
weight = torch.sum(self.conv.weight, dim=2) + bias = self.conv.bias + x = F.conv2d(x, weight=weight, bias=bias,stride=self.conv.stride[1:]) + return x diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/conv_wan.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/conv_wan.py new file mode 100644 index 0000000000000000000000000000000000000000..63ac63aeb8578e8670d51f2e4e90d84e7961e391 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/conv_wan.py @@ -0,0 +1,503 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from typing import Dict, Optional, Tuple, Union +import torch +import torch.nn as nn +from einops import rearrange, repeat +import torch.nn.functional as F +from .misc import swish +from infinity.models.videovae.modules.normalization_wan import get_norm +from infinity.models.videovae.utils.context_parallel import ContextParallelUtils as cp +from infinity.models.videovae.utils.context_parallel import dist_conv_cache_send, dist_conv_cache_recv + + +class DCDownBlock3d(nn.Module): + def __init__(self, + in_channels: int, + out_channels: int, + shortcut: bool = True, + group_norm=False, + compress_time=False, + norm_type=None, + pad_mode="constant", + ) -> None: + super().__init__() + self.shortcut = shortcut + self.compress_time = compress_time + if group_norm: + norm_layer = get_norm(norm_type) + self.norm = norm_layer(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True) + self.nonlinearity = swish + else: + self.norm = nn.Identity() + self.nonlinearity = nn.Identity() + self.spatial_factor = 2 + self.temporal_factor = int(compress_time) if compress_time else 1 + out_ratio = self.spatial_factor**2 + assert out_channels % out_ratio == 0 + out_channels = out_channels // out_ratio + + # self.conv = nn.Conv3d( + # in_channels, + # out_channels, + # kernel_size=3, + # stride=(1, 1, 1), + # padding=0, + # ) + self.conv = CogVideoXCausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode) + + def forward(self, hidden_states: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None, temporal_compress = True) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + x = hidden_states + x = self.nonlinearity(self.norm(x)) + assert x.ndim == 5, f"x.ndim must be (B C T H W)" + + ### use nn.Conv3d + # x = F.pad(x, (1, 1, 1, 1, 2, 0)) # causal pad (left, right, top, bottom, front, back) + # x[:, :, :2, 1:-1, 1:-1] = x[:, :, 2:3, 1:-1, 1:-1].clone() # broadcast the first value + # x = self.conv(x) + + ### use CogVideoXCausalConv3d + x, new_conv_cache["conv"] = self.conv(x, conv_cache=conv_cache.get("conv")) + + if x.shape[2] > 1: + if x.shape[2] % 2 == 1: + x_first, x_rest = x[:, :, 0, ...], x[:, :, 1:, ...] + y_first, y_rest = hidden_states[:, :, 0, ...], hidden_states[:, :, 1:, ...] 
+ else: + x_first, x_rest = None, x + y_first, y_rest = None, hidden_states + elif x.shape[2] == 1: + x_first, x_rest = x[:, :, 0, ...], None + y_first, y_rest = hidden_states[:, :, 0, ...], None + else: + raise NotImplementedError + if x_first is not None: + x_first = rearrange(x_first, "b c (h ph) (w pw) -> b (ph pw c) h w", ph=self.spatial_factor, pw=self.spatial_factor) + y_first = rearrange(y_first, "b c (h ph) (w pw) -> b (ph pw c) h w", ph=self.spatial_factor, pw=self.spatial_factor) + if x_rest is not None: + if temporal_compress: + x_rest = rearrange(x_rest, "b c (t pt) (h ph) (w pw) -> b (ph pw c) t pt h w", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + x_rest = x_rest.mean(dim=3) + y_rest = rearrange(y_rest, "b c (t pt) (h ph) (w pw) -> b (ph pw c) t pt h w", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + y_rest = y_rest.mean(dim=3) + else: + x_rest = rearrange(x_rest, "b c (t pt) (h ph) (w pw) -> b (ph pw c) (t pt) h w", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + y_rest = rearrange(y_rest, "b c (t pt) (h ph) (w pw) -> b (ph pw c) (t pt) h w", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + if x_first is not None and x_rest is not None: + x = torch.cat([x_first[:,:, None,...], x_rest], dim=2) + y = torch.cat([y_first[:,:, None,...], y_rest], dim=2) + else: + x = x_first[:,:, None,...] if x_first is not None else x_rest + y = y_first[:,:, None,...] if y_first is not None else y_rest + if self.shortcut: + y = rearrange(y, "b (g c) t h w -> b g c t h w", c=x.shape[1]).mean(dim=1) + hidden_states = x + y + else: + hidden_states = x + return hidden_states, new_conv_cache + +class DCUpBlock3d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + shortcut: bool = True, + interpolation_mode: str = "nearest", + group_norm=False, + compress_time=False, + norm_type=None, + pad_mode="constant", + ) -> None: + super().__init__() + + self.compress_time = compress_time + if group_norm: + norm_layer = get_norm(norm_type) + self.norm = norm_layer(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True) + self.nonlinearity = swish + else: + self.norm = nn.Identity() + self.nonlinearity = nn.Identity() + self.interpolation_mode = interpolation_mode + self.shortcut = shortcut + self.spatial_factor = 2 + self.temporal_factor = int(compress_time) if compress_time else 1 + out_channels = out_channels * self.spatial_factor**2 * self.temporal_factor + # self.conv = nn.Conv3d(in_channels, out_channels, 3, (1, 1, 1), 0) + self.conv = CogVideoXCausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode) + assert out_channels % in_channels == 0 + self.repeats = out_channels // in_channels + + def forward(self, hidden_states: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None, split_first=False) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + x = hidden_states + x = self.nonlinearity(self.norm(x)) + + compress_first = False + if x.shape[2] % 2 == 1 or split_first: + compress_first = True + + ### use nn.Conv3d + # x = F.pad(x, (1, 1, 1, 1, 2, 0)) # causal pad (left, right, top, bottom, front, back) + # x[:, :, :2, 1:-1, 1:-1] = x[:, :, 2:3, 1:-1, 1:-1].clone() # broadcast the first value + # x = self.conv(x) + + ### use CogVideoXCausalConv3d + x, new_conv_cache["conv"] = self.conv(x, conv_cache=conv_cache.get("conv")) + + x = rearrange(x, "b (pt ph pw c) t h w -> b c (t pt) (h ph) (w pw)", 
pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + y = repeat(hidden_states, "b c t h w -> b (r c) t h w", r=self.repeats) + y = rearrange(y, "b (pt ph pw c) t h w -> b c (t pt) (h ph) (w pw)", pt=self.temporal_factor, ph=self.spatial_factor, pw=self.spatial_factor) + + # convert pt+pt*n -> 1+pt*n + if self.temporal_factor > 1 and compress_first: + if x.shape[2] > 1: + x_first, x_rest = x[:, :, :self.temporal_factor, ...], x[:, :, self.temporal_factor:, ...] + y_first, y_rest = y[:, :, :self.temporal_factor, ...], y[:, :, self.temporal_factor:, ...] + elif x.shape[2] == 1: + assert x.shape[2] == y.shape[2] == self.temporal_factor + x_first, x_rest = x, None + y_first, y_rest = y, None + else: + raise NotImplementedError + x = torch.cat([x_first.mean(dim=2, keepdim=True), x_rest], dim=2) + y = torch.cat([y_first.mean(dim=2, keepdim=True), y_rest], dim=2) + if self.shortcut: + hidden_states = x + y + else: + hidden_states = x + return hidden_states, new_conv_cache + +class DCDownBlock2d(nn.Module): + def __init__(self, + in_channels: int, + out_channels: int, + downsample: bool = False, + shortcut: bool = True, + group_norm=False, + pad_mode="contant", + ) -> None: + super().__init__() + if group_norm: + self.norm = nn.GroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True) + self.nonlinearity = swish + else: + self.norm = nn.Identity() + self.nonlinearity = nn.Identity() + self.downsample = downsample + self.factor = 2 + self.stride = 1 if downsample else 2 + self.group_size = in_channels * self.factor**2 // out_channels + self.shortcut = shortcut + + out_ratio = self.factor**2 + if downsample: + assert out_channels % out_ratio == 0 + out_channels = out_channels // out_ratio + + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=self.stride, + padding=1, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = self.nonlinearity(self.norm(hidden_states)) + x = self.conv(x) + if self.downsample: + x = F.pixel_unshuffle(x, self.factor) + + if self.shortcut: + y = F.pixel_unshuffle(hidden_states, self.factor) + y = y.unflatten(1, (-1, self.group_size)) + y = y.mean(dim=2) + hidden_states = x + y + else: + hidden_states = x + + return hidden_states + +class DCUpBlock2d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + interpolate: bool = False, + shortcut: bool = True, + interpolation_mode: str = "nearest", + group_norm=False, + pad_mode="constant", + ) -> None: + super().__init__() + + if group_norm: + self.norm = nn.GroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True) + self.nonlinearity = swish + else: + self.norm = nn.Identity() + self.nonlinearity = nn.Identity() + self.interpolate = interpolate + self.interpolation_mode = interpolation_mode + self.shortcut = shortcut + self.factor = 2 + self.repeats = out_channels * self.factor**2 // in_channels + + out_ratio = self.factor**2 + + if not interpolate: + out_channels = out_channels * out_ratio + + self.conv = nn.Conv2d(in_channels, out_channels, 3, 1, 1, padding_mode=pad_mode) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = self.nonlinearity(self.norm(hidden_states)) + if self.interpolate: + x = F.interpolate(x, scale_factor=self.factor, mode=self.interpolation_mode) + x = self.conv(x) + else: + x = self.conv(x) + x = F.pixel_shuffle(x, self.factor) + + if self.shortcut: + y = hidden_states.repeat_interleave(self.repeats, dim=1) + y = F.pixel_shuffle(y, self.factor) + 
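+ # Shortcut path: the input channels were repeated and pixel-shuffled above so the skip
+ # connection matches the spatially upsampled shape of the conv output added below.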
hidden_states = x + y + else: + hidden_states = x + + return hidden_states + +class CogVideoXSafeConv3d(nn.Conv3d): + r""" + A 3D convolution layer that splits the input tensor into smaller parts to avoid OOM in CogVideoX Model. + """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + memory_count = ( + (input.shape[1] * input.shape[2] * input.shape[3] * input.shape[4]) * 2 / 1024**3 + ) + + # Set to 2GB, suitable for CuDNN + if memory_count > 2: + kernel_size = self.kernel_size[0] + part_num = int(memory_count / 2) + 1 + input_chunks = torch.chunk(input, part_num, dim=2) + + if kernel_size > 1: + input_chunks = [input_chunks[0]] + [ + torch.cat((input_chunks[i - 1][:, :, -kernel_size + 1 :], input_chunks[i]), dim=2) + for i in range(1, len(input_chunks)) + ] + + output_chunks = [] + for input_chunk in input_chunks: + output_chunks.append(super().forward(input_chunk)) + output = torch.cat(output_chunks, dim=2) + return output + else: + return super().forward(input) + + +class CogVideoXCausalConv3d(nn.Module): + r"""A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model. + + Args: + in_channels (`int`): Number of channels in the input tensor. + out_channels (`int`): Number of output channels produced by the convolution. + kernel_size (`int` or `Tuple[int, int, int]`): Kernel size of the convolutional kernel. + stride (`int`, defaults to `1`): Stride of the convolution. + dilation (`int`, defaults to `1`): Dilation rate of the convolution. + pad_mode (`str`, defaults to `"constant"`): Padding mode. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int, int]], + stride: int = 1, + dilation: int = 1, + pad_mode: str = "constant", + ): + super().__init__() + + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) * 3 + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + self.pad_mode = pad_mode + time_pad = dilation * (time_kernel_size - 1) + (1 - stride) + height_pad = height_kernel_size // 2 + width_pad = width_kernel_size // 2 + + self.height_pad = height_pad + self.width_pad = width_pad + self.time_pad = time_pad + self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0) + + self.temporal_dim = 2 + self.time_kernel_size = time_kernel_size + + stride = (stride, 1, 1) + dilation = (dilation, 1, 1) + self.conv = CogVideoXSafeConv3d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + ) + + def fake_context_parallel_forward( + self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None + ) -> torch.Tensor: + kernel_size = self.time_kernel_size + + if cp.cp_on(): + conv_cache = dist_conv_cache_recv() + + if kernel_size > 1: + cached_inputs = [conv_cache.to(inputs.device)] if conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1) + inputs = torch.cat(cached_inputs + [inputs], dim=2) + return inputs + + def forward(self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None) -> torch.Tensor: + inputs = self.fake_context_parallel_forward(inputs, conv_cache) + + if cp.cp_on(): + dist_conv_cache_send(inputs[:, :, -self.time_kernel_size + 1 :]) + else: + conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() + + + padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad) + if self.pad_mode == "constant": + inputs = F.pad(inputs, padding_2d, mode="constant", value=0) + else: + _shape = inputs.shape 
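+ # Replicate padding only acts on the trailing H/W dims, so the leading (B, C, T) dims are
+ # flattened first, the frames are padded, and the original shape is restored afterwards.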
+ inputs = F.pad(inputs.view(-1, *inputs.shape[-2:]), padding_2d, mode="replicate") + inputs = inputs.view(*_shape[:-2], *inputs.shape[-2:]) + + output = self.conv(inputs) + return output, conv_cache + +class FluxConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, cnn_type="2d", cnn_slice_seq_len=17, causal_offset=0, temporal_down=False): + super().__init__() + self.cnn_type = cnn_type + self.slice_seq_len = cnn_slice_seq_len + + if cnn_type == "2d": + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding) + if cnn_type == "3d": + if temporal_down == False: + stride = (1, stride, stride) + else: + stride = (stride, stride, stride) + self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=0) + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size, kernel_size) + self.padding = ( + kernel_size[0] - 1 + causal_offset, # Temporal causal padding + padding, # Height padding + padding # Width padding + ) + self.causal_offset = causal_offset + self.stride = stride + self.kernel_size = kernel_size + + def forward(self, x): + if self.cnn_type == "2d": + if type(x) == list: + for i in range(len(x)): + x[i] = self.forward(x[i]) + return x + if x.ndim == 5: + B, C, T, H, W = x.shape + x = rearrange(x, "B C T H W -> (B T) C H W") + x = self.conv(x) + x = rearrange(x, "(B T) C H W -> B C T H W", T=T) + return x + else: + return self.conv(x) + if self.cnn_type == "3d": + if x.ndim == 5: + assert self.stride[0] == 1 or self.stride[0] == 2, f"only temporal stride = 1 or 2 are supported" + if self.stride[0] == 1: + for i in reversed(range(0, x.shape[2], self.slice_seq_len+self.stride[0]-1)): + st = i + en = min(i+self.slice_seq_len, x.shape[2]) + _x = x[:,:,st:en,:,:] + if i == 0: + _x = F.pad(_x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1], # Height + self.padding[0], 0)) # Temporal + _x[:,:,:self.padding[0], + self.padding[1]:_x.shape[-2]-self.padding[1], + self.padding[2]:_x.shape[-1]-self.padding[2]] = x[:,:,0:1,:,:].clone() # broadcast the first value + else: + padding_0 = self.kernel_size[0] - 1 + _x = F.pad(_x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1], # Height + padding_0, 0)) # Temporal + _x[:,:,:padding_0, + self.padding[1]:_x.shape[-2]-self.padding[1], + self.padding[2]:_x.shape[-1]-self.padding[2]] = x[:,:,i-padding_0:i,:,:].clone() + try: + _x = self.conv(_x) + except: + xs = [_x[:,:,:,:,i-1:i+2] for i in range(1,_x.shape[-1]-1)] + for i in range(len(xs)): + xs[i] = self.conv(xs[i]) + _x = torch.cat(xs, dim=-1) + if i == 0: + x[:,:,st-self.causal_offset:en,:,:] = _x + x = x[:,:,1:,:,:] + else: + x[:,:,st:en,:,:] = _x + else: + xs = [] + for i in range(0, x.shape[2], self.slice_seq_len+self.stride[0]-1): + st = i + en = min(i+self.slice_seq_len, x.shape[2]) + _x = x[:,:,st:en,:,:] + if i == 0: + _x = F.pad(_x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1], # Height + self.padding[0], 0)) # Temporal + _x[:,:,:self.padding[0], + self.padding[1]:_x.shape[-2]-self.padding[1], + self.padding[2]:_x.shape[-1]-self.padding[2]] = x[:,:,0:1,:,:].clone() # broadcast the first value + else: + padding_0 = self.kernel_size[0] - 1 + _x = F.pad(_x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1], # Height + padding_0, 0)) # Temporal + _x[:,:,:padding_0, + self.padding[1]:_x.shape[-2]-self.padding[1], + self.padding[2]:_x.shape[-1]-self.padding[2]] 
= x[:,:,i-padding_0:i,:,:].clone() + _x = self.conv(_x) + xs.append(_x) + try: + x = torch.cat(xs, dim=2) + except: + device = x.device + del x + xs = [_x.cpu().pin_memory() for _x in xs] + torch.cuda.empty_cache() + x = torch.cat([_x for _x in xs], dim=2).to(device=device) + else: + x = F.pad(x, (self.padding[2], self.padding[2], # Width + self.padding[1], self.padding[1])) # Height + weight = torch.sum(self.conv.weight, dim=2) + bias = self.conv.bias + x = F.conv2d(x, weight=weight, bias=bias,stride=self.conv.stride[1:]) + return x \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/diffaug.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/diffaug.py new file mode 100644 index 0000000000000000000000000000000000000000..19800928de02c307746f4282dd9bb965429eba9b --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/diffaug.py @@ -0,0 +1,97 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +"""Training GANs with DiffAugment.""" + +import numpy as np +import torch +import torch.nn.functional as F + + +def DiffAugment(x: torch.Tensor, policy: str = '', channels_first: bool = True) -> torch.Tensor: + if policy: + if not channels_first: + x = x.permute(0, 3, 1, 2) + for p in policy.split(','): + for f in AUGMENT_FNS[p]: + x = f(x) + if not channels_first: + x = x.permute(0, 2, 3, 1) + x = x.contiguous() + return x + + +def rand_brightness(x: torch.Tensor) -> torch.Tensor: + x = x + (torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) - 0.5) + return x + + +def rand_saturation(x: torch.Tensor) -> torch.Tensor: + x_mean = x.mean(dim=1, keepdim=True) + x = (x - x_mean) * (torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) * 2) + x_mean + return x + + +def rand_contrast(x: torch.Tensor) -> torch.Tensor: + x_mean = x.mean(dim=[1, 2, 3], keepdim=True) + x = (x - x_mean) * (torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) + 0.5) + x_mean + return x + + +def rand_translation(x: torch.Tensor, ratio: float = 0.125) -> torch.Tensor: + shift_x, shift_y = int(x.size(2) * ratio + 0.5), int(x.size(3) * ratio + 0.5) + translation_x = torch.randint(-shift_x, shift_x + 1, size=[x.size(0), 1, 1], device=x.device) + translation_y = torch.randint(-shift_y, shift_y + 1, size=[x.size(0), 1, 1], device=x.device) + grid_batch, grid_x, grid_y = torch.meshgrid( + torch.arange(x.size(0), dtype=torch.long, device=x.device), + torch.arange(x.size(2), dtype=torch.long, device=x.device), + torch.arange(x.size(3), dtype=torch.long, device=x.device), + ) + grid_x = torch.clamp(grid_x + translation_x + 1, 0, x.size(2) + 1) + grid_y = torch.clamp(grid_y + translation_y + 1, 0, x.size(3) + 1) + x_pad = F.pad(x, [1, 1, 1, 1, 0, 0, 0, 0]) + x = x_pad.permute(0, 2, 3, 1).contiguous()[grid_batch, grid_x, grid_y].permute(0, 3, 1, 2) + return x + + +def rand_cutout(x: torch.Tensor, ratio: float = 0.2) -> torch.Tensor: + cutout_size = int(x.size(2) * ratio + 0.5), int(x.size(3) * ratio + 0.5) + offset_x = torch.randint(0, x.size(2) + (1 - cutout_size[0] % 2), size=[x.size(0), 1, 1], device=x.device) + offset_y = torch.randint(0, x.size(3) + (1 - cutout_size[1] % 2), size=[x.size(0), 1, 1], device=x.device) + grid_batch, grid_x, grid_y = torch.meshgrid( + torch.arange(x.size(0), dtype=torch.long, device=x.device), + torch.arange(cutout_size[0], dtype=torch.long, device=x.device), + torch.arange(cutout_size[1], dtype=torch.long, device=x.device), + ) + grid_x = torch.clamp(grid_x + offset_x - 
cutout_size[0] // 2, min=0, max=x.size(2) - 1) + grid_y = torch.clamp(grid_y + offset_y - cutout_size[1] // 2, min=0, max=x.size(3) - 1) + mask = torch.ones(x.size(0), x.size(2), x.size(3), dtype=x.dtype, device=x.device) + mask[grid_batch, grid_x, grid_y] = 0 + x = x * mask.unsqueeze(1) + return x + + +def rand_resize(x: torch.Tensor, min_ratio: float = 0.8, max_ratio: float = 1.2) -> torch.Tensor: + resize_ratio = np.random.rand()*(max_ratio-min_ratio) + min_ratio + resized_img = F.interpolate(x, size=int(resize_ratio*x.shape[3]), mode='bilinear') + org_size = x.shape[3] + if int(resize_ratio*x.shape[3]) < x.shape[3]: + left_pad = (x.shape[3]-int(resize_ratio*x.shape[3]))/2. + left_pad = int(left_pad) + right_pad = x.shape[3] - left_pad - resized_img.shape[3] + x = F.pad(resized_img, (left_pad, right_pad, left_pad, right_pad), "constant", 0.) + else: + left = (int(resize_ratio*x.shape[3])-x.shape[3])/2. + left = int(left) + x = resized_img[:, :, left:(left+x.shape[3]), left:(left+x.shape[3])] + assert x.shape[2] == org_size + assert x.shape[3] == org_size + return x + + +AUGMENT_FNS = { + 'color': [rand_brightness, rand_saturation, rand_contrast], + 'translation': [rand_translation], + 'resize': [rand_resize], + 'cutout': [rand_cutout], +} \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/drop_path.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/drop_path.py new file mode 100644 index 0000000000000000000000000000000000000000..5356b677b459203f94d31a6078b9fc17408cd15e --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/drop_path.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +# from timm.models.layers import DropPath +import torch + +def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +class DropPath(torch.nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
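+ For example, with drop_prob=0.1 at training time each sample's residual branch is zeroed
+ with probability 0.1 and otherwise kept (scaled by 1/0.9 when scale_by_keep=True), so the
+ expected output matches evaluation behaviour.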
+ """ + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob,3):0.3f}' \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/loss.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..37d9506ae21c864c289f36335d50a1b8948596c3 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/loss.py @@ -0,0 +1,88 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import grad + +def hinge_d_loss(logits_real, logits_fake): + loss_real = torch.mean(F.relu(1. - logits_real)) + loss_fake = torch.mean(F.relu(1. + logits_fake)) + d_loss = 0.5 * (loss_real + loss_fake) + return d_loss + +def vanilla_d_loss(logits_real, logits_fake): + d_loss = 0.5 * ( + torch.mean(torch.nn.functional.softplus(-logits_real)) + + torch.mean(torch.nn.functional.softplus(logits_fake))) + return d_loss + +def get_disc_loss(disc_loss_type): + if disc_loss_type == 'vanilla': + disc_loss = vanilla_d_loss + elif disc_loss_type == 'hinge': + disc_loss = hinge_d_loss + return disc_loss + +def adopt_weight(global_step, threshold=0, value=0., warmup=0): + if global_step < threshold or threshold < 0: + weight = value + else: + weight = 1 + if global_step - threshold < warmup: + weight = min((global_step - threshold) / warmup, 1) + return weight + +def gradient_penalty(discriminator, real_data, fake_data, device): + alpha = torch.rand(real_data.size(0), 1, device=device) + alpha = alpha.expand_as(real_data) + interpolates = alpha * real_data + ((1 - alpha) * fake_data) + interpolates = torch.autograd.Variable(interpolates, requires_grad=True) + + d_interpolates = discriminator(interpolates) + gradients = grad( + outputs=d_interpolates, + inputs=interpolates, + grad_outputs=torch.ones_like(d_interpolates, device=device), + create_graph=True, + retain_graph=True, + only_inputs=True + )[0] + + gradients = gradients.view(gradients.size(0), -1) + gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() + return gradient_penalty + +class InfoNCELoss(nn.Module): + def __init__(self, temperature: float = 0.07): + super(InfoNCELoss, self).__init__() + self.temperature = temperature + + def forward(self, features: torch.Tensor, features_prime: torch.Tensor) -> torch.Tensor: + batch_size = features.shape[0] + + # Normalize feature vectors + features = F.normalize(features, dim=1) + features_prime = F.normalize(features_prime, dim=1) + + # Concatenate features and features_prime + combined_features = torch.cat([features, features_prime], dim=0) + + # Compute similarity matrix + similarity_matrix = torch.matmul(combined_features, combined_features.T) / self.temperature + + # Mask to exclude self-similarity + mask = torch.eye(2 * batch_size, dtype=torch.bool).to(features.device) + similarity_matrix.masked_fill_(mask, float('-inf')) + + # Create labels for contrastive loss + labels = torch.arange(batch_size).to(features.device) + labels = torch.cat([labels, labels], dim=0) + + # Compute logits (separate positive and negative pairs) + positives_logits = torch.cat([similarity_matrix[:batch_size, batch_size:], 
similarity_matrix[batch_size:, :batch_size]], dim=0) + + # The labels are like: [0, 1, 2, ..., batch_size-1, 0, 1, 2, ..., batch_size-1] + loss = F.cross_entropy(positives_logits, labels) + + return loss \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/lpips.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/lpips.py new file mode 100644 index 0000000000000000000000000000000000000000..659c6f629c82b83fbd448dd6c2bb4ee95d135c18 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/lpips.py @@ -0,0 +1,188 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +"""Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models""" + +import os, hashlib +import requests +from tqdm import tqdm + +import torch +import torch.nn as nn +from torchvision import models +from collections import namedtuple + +from infinity.models.videovae.utils.misc import bytenas_manager, set_tf32_flags + +URL_MAP = { + "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1" +} + +CKPT_MAP = { + "vgg_lpips": "vgg.pth" +} + +MD5_MAP = { + "vgg_lpips": "d507d7349b931f0638a25a48a722f98a" +} + +def download(url, local_path, chunk_size=1024): + os.makedirs(os.path.split(local_path)[0], exist_ok=True) + with requests.get(url, stream=True) as r: + total_size = int(r.headers.get("content-length", 0)) + with tqdm(total=total_size, unit="B", unit_scale=True) as pbar: + with open(local_path, "wb") as f: + for data in r.iter_content(chunk_size=chunk_size): + if data: + f.write(data) + pbar.update(chunk_size) + + +def md5_hash(path): + with open(path, "rb") as f: + content = f.read() + return hashlib.md5(content).hexdigest() + + +def get_ckpt_path(name, root, check=False): + assert name in URL_MAP + path = os.path.join(root, CKPT_MAP[name]) + if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]): + print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path)) + download(URL_MAP[name], path) + md5 = md5_hash(path) + assert md5 == MD5_MAP[name], md5 + return path + +class ResNet50LPIPS(nn.Module): + def __init__(self): + super().__init__() + resnet50 = models.resnet50(weights=models.ResNet50_Weights.DEFAULT) + self.lpips_net = nn.Sequential(*(list(resnet50.children())[:-2])) + self.lpips_loss = nn.MSELoss() + + def forward(self, input, target): + return self.lpips_loss(self.lpips_net(input), self.lpips_net(target),) + +class LPIPS(nn.Module): + # Learned perceptual metric + def __init__(self, use_dropout=True, upcast_tf32=False): + super().__init__() + self.upcast_tf32 = upcast_tf32 + self.scaling_layer = ScalingLayer() + self.chns = [64, 128, 256, 512, 512] # vg16 features + self.net = vgg16(pretrained=True, requires_grad=False) + self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout) + self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout) + self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout) + self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout) + self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout) + self.load_from_pretrained() + for param in self.parameters(): + param.requires_grad = False + + def load_from_pretrained(self, name="vgg_lpips"): + ckpt = get_ckpt_path(name, os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")) + self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu"), weights_only=True), strict=False) + print("loaded pretrained LPIPS loss from 
{}".format(ckpt)) + + @classmethod + def from_pretrained(cls, name="vgg_lpips"): + if name != "vgg_lpips": + raise NotImplementedError + model = cls() + ckpt = get_ckpt_path(name, os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")) + model.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu"), weights_only=True), strict=False) + return model + + def forward(self, input, target): + with set_tf32_flags(not self.upcast_tf32): + in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target)) + outs0, outs1 = self.net(in0_input), self.net(in1_input) + feats0, feats1, diffs = {}, {}, {} + lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4] + for kk in range(len(self.chns)): + feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk]) + diffs[kk] = (feats0[kk] - feats1[kk]) ** 2 + + res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))] + val = res[0] + for l in range(1, len(self.chns)): + # print(res[l].shape) + val += res[l] + + return val + + +class ScalingLayer(nn.Module): + def __init__(self): + super(ScalingLayer, self).__init__() + self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None]) + self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None]) + + def forward(self, inp): + return (inp - self.shift) / self.scale + + +class NetLinLayer(nn.Module): + """ A single linear layer which does a 1x1 conv """ + def __init__(self, chn_in, chn_out=1, use_dropout=False): + super(NetLinLayer, self).__init__() + layers = [nn.Dropout(), ] if (use_dropout) else [] + layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ] + self.model = nn.Sequential(*layers) + + +class vgg16(torch.nn.Module): + def __init__(self, requires_grad=False, pretrained=True): + super(vgg16, self).__init__() + # load locally + assert pretrained == True + vgg_model = models.vgg16() + vgg_model.load_state_dict(torch.load(bytenas_manager("checkpoints/vgg16-397923af.pth"), weights_only=True)) + vgg_pretrained_features = vgg_model.features + + self.slice1 = torch.nn.Sequential() + self.slice2 = torch.nn.Sequential() + self.slice3 = torch.nn.Sequential() + self.slice4 = torch.nn.Sequential() + self.slice5 = torch.nn.Sequential() + self.N_slices = 5 + for x in range(4): + self.slice1.add_module(str(x), vgg_pretrained_features[x]) + for x in range(4, 9): + self.slice2.add_module(str(x), vgg_pretrained_features[x]) + for x in range(9, 16): + self.slice3.add_module(str(x), vgg_pretrained_features[x]) + for x in range(16, 23): + self.slice4.add_module(str(x), vgg_pretrained_features[x]) + for x in range(23, 30): + self.slice5.add_module(str(x), vgg_pretrained_features[x]) + if not requires_grad: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, X): + h = self.slice1(X) + h_relu1_2 = h + h = self.slice2(h) + h_relu2_2 = h + h = self.slice3(h) + h_relu3_3 = h + h = self.slice4(h) + h_relu4_3 = h + h = self.slice5(h) + h_relu5_3 = h + vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3']) + out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3) + return out + + +def normalize_tensor(x,eps=1e-10): + # norm_factor = torch.sqrt(torch.sum(x**2,dim=1,keepdim=True)) + norm_factor = x.norm(p=2, dim=1, keepdim=True) + return x/(norm_factor+eps) + + +def spatial_average(x, keepdim=True): + return x.mean([2,3],keepdim=keepdim) diff --git 
a/Meissonic/InfinityStar/infinity/models/videovae/modules/misc.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..0320148bc517eed0be097e1b4b9c54a0c9a21e23 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/misc.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch + +def swish(x): + if type(x) == list: + for i in range(len(x)): + x[i] = swish(x[i]) + return x + try: + return x*torch.sigmoid(x) + except: + for _i in range(x.shape[2]): + x[:,:,_i:_i+1,:,:] = x[:,:,_i:_i+1,:,:]*torch.sigmoid(x[:,:,_i:_i+1,:,:]) + return x \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/normalization.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..fb85ae0ea457ac8b7b15f20a542c0db2706e70c8 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/normalization.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + + +class SpatialGroupNorm(nn.GroupNorm): + def __init__(self, *args, **kwargs): + super(SpatialGroupNorm, self).__init__(*args, **kwargs) + + def shard_norm(self, x): + dtype = x.dtype + x = x.to(torch.float32) + with torch.amp.autocast("cuda", torch.float32): + for _i in range(x.shape[0]): + x[_i:_i+1,...] = super(SpatialGroupNorm, self).forward(x[_i:_i+1,...]) + x = x.to(dtype=dtype) + return x + + def forward(self, x): + dtype = x.dtype + x = x.to(torch.float32) + assert x.ndim == 5 + T = x.shape[2] + x = rearrange(x, "B C T H W -> (B T) C H W") + try: + x = super(SpatialGroupNorm, self).forward(x) + except: + x = self.shard_norm(x) # shard norm if OOM fallback + x = rearrange(x, "(B T) C H W -> B C T H W", T=T) + x = x.to(dtype=dtype) + return x + +class Normalize(nn.Module): + def __init__(self, in_channels, norm_type, norm_axis="spatial"): + super().__init__() + self.norm_axis = norm_axis + assert norm_type in ['group', 'batch', "no"] + if norm_type == 'group': + if in_channels % 32 == 0: + self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + elif in_channels % 24 == 0: + self.norm = nn.GroupNorm(num_groups=24, num_channels=in_channels, eps=1e-6, affine=True) + else: + raise NotImplementedError + elif norm_type == 'batch': + self.norm = nn.SyncBatchNorm(in_channels, track_running_stats=False) # Runtime Error: grad inplace if set track_running_stats to True + elif norm_type == 'no': + self.norm = nn.Identity() + + def _norm(self, x): + try: + x = self.norm(x) + except: + device = x.device + self.norm_cpu = self.norm.cpu() + x = self.norm_cpu(x.cpu().pin_memory()).to(device=device) + return x + + def shard_norm(self, x): + dtype = x.dtype + x = x.to(torch.float32) + with torch.amp.autocast("cuda", torch.float32): + for _i in range(x.shape[0]): + x[_i:_i+1,...] 
= self.norm(x[_i:_i+1,...]) + x = x.to(dtype=dtype) + return x + + def forward(self, x): + if self.norm_axis == "spatial": + if type(x) == list: + for i in range(len(x)): + x[i] = self.norm(x[i]) + return x + if x.ndim == 4: + try: + x = self.norm(x) + except: + x = self.shard_norm(x) + else: + B, C, T, H, W = x.shape + x = rearrange(x, "B C T H W -> (B T) C H W") + # x = self.shard_norm(x) + try: + x = self.norm(x) + except: + x = self.shard_norm(x) + x = rearrange(x, "(B T) C H W -> B C T H W", T=T) + elif self.norm_axis == "spatial-temporal": + x = self._norm(x) + else: + raise NotImplementedError + return x + +def l2norm(t): + return F.normalize(t, dim=-1) + +class LayerNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.gamma = nn.Parameter(torch.ones(dim)) + self.register_buffer("beta", torch.zeros(dim)) + + def forward(self, x): + return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta) + +# https://github.com/huggingface/transformers/blob/2f12e408225b1ebceb0d2f701ce419d46678dc31/src/transformers/models/llama/modeling_llama.py#L76 +class RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states, sp_slice=None): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + if sp_slice is None: + return (self.weight * hidden_states).to(input_dtype) + else: + return (self.weight[sp_slice] * hidden_states).to(input_dtype) # torch.float32 * torchbfloat16 in DDP will cast to torch.float32 diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/normalization_wan.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/normalization_wan.py new file mode 100644 index 0000000000000000000000000000000000000000..a301df8f4214db3e4d6118d3b76637edc343f0a7 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/normalization_wan.py @@ -0,0 +1,155 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + + +def get_norm(norm_type): + if norm_type == "spatial-group": + return SpatialGroupNorm + elif norm_type == "rms": + return RMS_norm + elif norm_type == "group": + return nn.GroupNorm + else: + raise NotImplementedError + +class RMS_norm(nn.Module): + + def __init__(self, num_channels, channel_first=True, bias=False, **kwargs): + super().__init__() + broadcastable_dims = (1, 1, 1) + shape = (num_channels, *broadcastable_dims) + + self.channel_first = channel_first + self.scale = num_channels**0.5 + self.gamma = nn.Parameter(torch.ones(shape)) + self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0. + + def forward(self, x): + return F.normalize( + x, dim=(1 if self.channel_first else + -1)) * self.scale * self.gamma + self.bias + +class SpatialGroupNorm(nn.GroupNorm): + def __init__(self, *args, **kwargs): + super(SpatialGroupNorm, self).__init__(*args, **kwargs) + + def shard_norm(self, x): + dtype = x.dtype + x = x.to(torch.float32) + with torch.amp.autocast("cuda", torch.float32): + for _i in range(x.shape[0]): + x[_i:_i+1,...] 
= super(SpatialGroupNorm, self).forward(x[_i:_i+1,...]) + x = x.to(dtype=dtype) + return x + + def forward(self, x): + dtype = x.dtype + x = x.to(torch.float32) + assert x.ndim == 5 + T = x.shape[2] + x = rearrange(x, "B C T H W -> (B T) C H W") + try: + x = super(SpatialGroupNorm, self).forward(x) + except: + x = self.shard_norm(x) # shard norm if OOM fallback + x = rearrange(x, "(B T) C H W -> B C T H W", T=T) + x = x.to(dtype=dtype) + return x + +class Normalize(nn.Module): + def __init__(self, in_channels, norm_type, norm_axis="spatial"): + super().__init__() + self.norm_axis = norm_axis + assert norm_type in ['group', 'batch', "no"] + if norm_type == 'group': + if in_channels % 32 == 0: + self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + elif in_channels % 24 == 0: + self.norm = nn.GroupNorm(num_groups=24, num_channels=in_channels, eps=1e-6, affine=True) + else: + raise NotImplementedError + elif norm_type == 'batch': + self.norm = nn.SyncBatchNorm(in_channels, track_running_stats=False) # Runtime Error: grad inplace if set track_running_stats to True + elif norm_type == 'no': + self.norm = nn.Identity() + + def _norm(self, x): + try: + x = self.norm(x) + except: + device = x.device + self.norm_cpu = self.norm.cpu() + x = self.norm_cpu(x.cpu().pin_memory()).to(device=device) + return x + + def shard_norm(self, x): + dtype = x.dtype + x = x.to(torch.float32) + with torch.amp.autocast("cuda", torch.float32): + for _i in range(x.shape[0]): + x[_i:_i+1,...] = self.norm(x[_i:_i+1,...]) + x = x.to(dtype=dtype) + return x + + def forward(self, x): + if self.norm_axis == "spatial": + if type(x) == list: + for i in range(len(x)): + x[i] = self.norm(x[i]) + return x + if x.ndim == 4: + try: + x = self.norm(x) + except: + x = self.shard_norm(x) + else: + B, C, T, H, W = x.shape + x = rearrange(x, "B C T H W -> (B T) C H W") + # x = self.shard_norm(x) + try: + x = self.norm(x) + except: + x = self.shard_norm(x) + x = rearrange(x, "(B T) C H W -> B C T H W", T=T) + elif self.norm_axis == "spatial-temporal": + x = self._norm(x) + else: + raise NotImplementedError + return x + +def l2norm(t): + return F.normalize(t, dim=-1) + +class LayerNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.gamma = nn.Parameter(torch.ones(dim)) + self.register_buffer("beta", torch.zeros(dim)) + + def forward(self, x): + return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta) + +# https://github.com/huggingface/transformers/blob/2f12e408225b1ebceb0d2f701ce419d46678dc31/src/transformers/models/llama/modeling_llama.py#L76 +class RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states, sp_slice=None): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + if sp_slice is None: + return (self.weight * hidden_states).to(input_dtype) + else: + return (self.weight[sp_slice] * hidden_states).to(input_dtype) # torch.float32 * torchbfloat16 in DDP will cast to torch.float32 diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__init__.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..b91d257f0fed0f07f195d57dfd74cb5d5408ce9f --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from .multiscale_bsq import MultiScaleBSQ +from .multiscale_fsq_tp import MultiScaleFSQTP +from .multiscale_bsq_tp import MultiScaleBSQTP +from .multiscale_bsq_tp_absorb_patchify import MultiScaleBSQTP as MultiScaleBSQTP_AP \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c68823d3a2b93f56863e3b1629d6d654967564d2 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/dynamic_resolution.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/dynamic_resolution.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4543911090c50836265cd6f3f04e111da1590c5c Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/dynamic_resolution.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/finite_scalar_quantization.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/finite_scalar_quantization.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8427d0dd6a438bc99383fa2be6c0fd3c6d902097 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/finite_scalar_quantization.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3b2304832b00aede9564148734addeb511150d3 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq_tp.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq_tp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..761efb3baaefbe2401993b6e1804483a47c8657b Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq_tp.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq_tp_absorb_patchify.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq_tp_absorb_patchify.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db9704d2454530501dccb511a1db3f506413bd1c Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_bsq_tp_absorb_patchify.cpython-310.pyc differ diff 
--git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_fsq_tp.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_fsq_tp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4c40fd8ffce2007d868a0f248801f551619dbee Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/__pycache__/multiscale_fsq_tp.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/bsq.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/bsq.py new file mode 100644 index 0000000000000000000000000000000000000000..75ca3eb0ebbcc87dedc13d3dfd62f5af07c132e1 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/bsq.py @@ -0,0 +1,440 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +""" +Binary Spherical Quantization +Proposed in https://arxiv.org/abs/2406.07548 + +In the simplest setup, each dimension is quantized into {-1, 1}. +An entropy penalty is used to encourage utilization. +""" + +import random +import copy +from math import log2, ceil +from functools import partial, cache +from collections import namedtuple +from contextlib import nullcontext + +import torch.distributed as dist +from torch.distributed import nn as dist_nn + +import torch +from torch import nn, einsum +import torch.nn.functional as F +from torch.nn import Module +from torch.amp import autocast +import numpy as np + +from einops import rearrange, reduce, pack, unpack + +# from einx import get_at + +# print(f"{dynamic_resolution_thw=}") + +# constants + +Return = namedtuple('Return', ['quantized', 'indices', 'bit_indices', 'entropy_aux_loss']) + +LossBreakdown = namedtuple('LossBreakdown', ['per_sample_entropy', 'batch_entropy', 'commitment']) + +# distributed helpers + +@cache +def is_distributed(): + return dist.is_initialized() and dist.get_world_size() > 1 + +def maybe_distributed_mean(t): + if not is_distributed(): + return t + + dist_nn.all_reduce(t) + t = t / dist.get_world_size() + return t + +# helper functions + +def exists(v): + return v is not None + +def identity(t): + return t + +def default(*args): + for arg in args: + if exists(arg): + return arg() if callable(arg) else arg + return None + +def round_up_multiple(num, mult): + return ceil(num / mult) * mult + +def pack_one(t, pattern): + return pack([t], pattern) + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + +def l2norm(t): + return F.normalize(t, dim = -1) + +# entropy + +def log(t, eps = 1e-5): + return t.clamp(min = eps).log() + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + +# cosine sim linear + +class CosineSimLinear(Module): + def __init__( + self, + dim_in, + dim_out, + scale = 1. 
+ ): + super().__init__() + self.scale = scale + self.weight = nn.Parameter(torch.randn(dim_in, dim_out)) + + def forward(self, x): + x = F.normalize(x, dim = -1) + w = F.normalize(self.weight, dim = 0) + return (x @ w) * self.scale + +def repeat_schedule(scale_schedule, repeat_scales_num, times): + new_scale_schedule = [] + for i in range(repeat_scales_num): + new_scale_schedule.extend([scale_schedule[i] for _ in range(times)]) + new_scale_schedule.extend(scale_schedule[repeat_scales_num:]) + return new_scale_schedule + + +class BSQ(Module): + def __init__( + self, + *, + dim = None, + entropy_loss_weight = 0.1, + commitment_loss_weight = 0.25, + num_codebooks = 1, + keep_num_codebooks_dim = None, + codebook_scale = 1., # for residual LFQ, codebook scaled down by 2x at each layer + frac_per_sample_entropy = 1., # make less than 1. to only use a random fraction of the probs for per sample entropy + soft_clamp_input_value = None, + channel_first = None, + experimental_softplus_entropy_loss = False, + entropy_loss_offset = 5., # how much to shift the loss before softplus + spherical = True, # from https://arxiv.org/abs/2406.07548 + force_quantization_f32 = True, # will force the quantization step to be full precision + inv_temperature = 100.0, + gamma0=1.0, gamma=1.0, zeta=1.0, + use_out_phi = False, # use output phi network + use_out_phi_res = False, # residual out phi + use_bernoulli = False, + use_rot_trick = False, + ): + super().__init__() + + # some assert validations + assert exists(dim) , 'dim must be specified for BSQ' + + codebook_dim = dim + codebook_dims = codebook_dim * num_codebooks + dim = default(dim, codebook_dims) + self.codebook_dims = codebook_dims + + self.out_phi = nn.Linear(codebook_dims, codebook_dims) if use_out_phi else nn.Identity() + self.use_out_phi_res = use_out_phi_res + if self.use_out_phi_res: + self.out_phi_scale = nn.Parameter(torch.zeros(codebook_dims), requires_grad=True) # init as zero + + self.dim = dim + self.codebook_dim = codebook_dim + self.num_codebooks = num_codebooks + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + # channel first + self.channel_first = channel_first + + # For BSQ (binary spherical quantization) + if not spherical: + raise ValueError("For BSQ, spherical must be True.") + self.persample_entropy_compute = 'analytical' + self.inv_temperature = inv_temperature + self.gamma0 = gamma0 # loss weight for entropy penalty + self.gamma = gamma # loss weight for entropy penalty + self.zeta = zeta # loss weight for entire entropy penalty + self.use_bernoulli = use_bernoulli + self.use_rot_trick = use_rot_trick + + # entropy aux loss related weights + + assert 0 < frac_per_sample_entropy <= 1. 
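+        # Worked example (illustrative sketch, values assumed): with codebook_dim = 4,
+        # quantize() maps z = [0.3, -1.2, 0.7, -0.1] to sign(z) / sqrt(4) =
+        # [0.5, -0.5, 0.5, -0.5], i.e. the nearest vertex of the scaled hypercube on
+        # the unit sphere, while the straight-through term z + (zhat - z).detach()
+        # lets gradients flow through unchanged.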
+ self.frac_per_sample_entropy = frac_per_sample_entropy + + self.entropy_loss_weight = entropy_loss_weight + + # codebook scale + + self.codebook_scale = codebook_scale + + # commitment loss + + self.commitment_loss_weight = commitment_loss_weight + + # whether to soft clamp the input value from -value to value + + self.soft_clamp_input_value = soft_clamp_input_value + assert not exists(soft_clamp_input_value) or soft_clamp_input_value >= codebook_scale + + # whether to make the entropy loss positive through a softplus (experimental, please report if this worked or not in discussions) + + self.entropy_loss_offset = entropy_loss_offset + self.experimental_softplus_entropy_loss = experimental_softplus_entropy_loss + + # for no auxiliary loss, during inference + + self.register_buffer('mask', 2 ** torch.arange(codebook_dim - 1, -1, -1)) + self.register_buffer('zero', torch.tensor(0.), persistent = False) + + # whether to force quantization step to be f32 + + self.force_quantization_f32 = force_quantization_f32 + + def bits_to_codes(self, bits): + return bits * self.codebook_scale * 2 - self.codebook_scale + + # @property + # def dtype(self): + # return self.codebook.dtype + + def indices_to_codes( + self, + indices, + label_type = 'int_label', + project_out = True + ): + assert label_type in ['int_label', 'bit_label'] + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + should_transpose = default(self.channel_first, is_img_or_video) + + if not self.keep_num_codebooks_dim: + if label_type == 'int_label': + indices = rearrange(indices, '... -> ... 1') + else: + indices = indices.unsqueeze(-2) + + # indices to codes, which are bits of either -1 or 1 + + if label_type == 'int_label': + assert indices[..., None].int().min() > 0 + bits = ((indices[..., None].int() & self.mask) != 0).float() # .to(self.dtype) + else: + bits = indices + + codes = self.bits_to_codes(bits).float() + + codes = l2norm(codes) # must normalize when using BSQ + + codes = rearrange(codes, '... c d -> ... (c d)') + + # whether to project codes out to original dimensions + # if the input feature dimensions were not log2(codebook size) + + # rearrange codes back to original shape + + if should_transpose: + codes = rearrange(codes, 'b ... d -> b d ...') + + return codes + + def quantize(self, z): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + + zhat = torch.where(z > 0, + torch.tensor(1, dtype=z.dtype, device=z.device), + torch.tensor(-1, dtype=z.dtype, device=z.device)) + + q_scale = 1. / (self.codebook_dims ** 0.5) + zhat = q_scale * zhat # on unit sphere + + return z + (zhat - z).detach() + + def quantize_new_bernoulli(self, z, prob_z): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + + zhat = (torch.bernoulli(prob_z) - 0.5) * 2.0 + + q_scale = 1. / (self.codebook_dims ** 0.5) + zhat = q_scale * zhat # on unit sphere + + return z + (zhat - z).detach() + + def rot_quantize(self, z, inference=False): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + q_scale = 1. 
/ (self.codebook_dims ** 0.5) + zhat = torch.where(z > 0, + torch.tensor(1, dtype=z.dtype, device=z.device), + torch.tensor(-1, dtype=z.dtype, device=z.device)) * q_scale + if inference: + return zhat + + w = ((z + zhat) / torch.norm(z + zhat, dim=-1, keepdim=True)).detach() + z = z.unsqueeze(1) - 2*torch.bmm(torch.bmm(z.unsqueeze(1), w.unsqueeze(-1)), w.unsqueeze(1)) + 2 * torch.bmm( + torch.bmm(z.unsqueeze(1), z.unsqueeze(-1).detach()), zhat.unsqueeze(1).detach()) + return z.squeeze() + + def soft_entropy_loss(self, z): + if self.persample_entropy_compute == 'analytical': + # if self.l2_norm: + p = torch.sigmoid(-4 * z / (self.codebook_dims ** 0.5) * self.inv_temperature) + # else: + # p = torch.sigmoid(-4 * z * self.inv_temperature) + prob = torch.stack([p, 1-p], dim=-1) # (b, h, w, 18, 2) + per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() # (b,h,w,18)->(b,h,w)->scalar + else: + per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() + + # macro average of the probability of each subgroup + avg_prob = reduce(prob, '... g d ->g d', 'mean') # (18, 2) + codebook_entropy = self.get_entropy(avg_prob, dim=-1, normalize=False) + + # the approximation of the entropy is the sum of the entropy of each subgroup + return per_sample_entropy, codebook_entropy.sum(), avg_prob + + def get_entropy(self, count, dim=-1, eps=1e-4, normalize=True): + if normalize: # False + probs = (count + eps) / (count + eps).sum(dim=dim, keepdim =True) + else: # True + probs = count + H = -(probs * torch.log(probs + 1e-8)).sum(dim=dim) + return H + + def forward( + self, + x, + return_loss_breakdown = False, + mask = None, + entropy_weight=0.1 + ): + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension, which is also log2(codebook size) + c - number of codebook dim + """ + + is_img_or_video = x.ndim >= 4 + should_transpose = default(self.channel_first, is_img_or_video) + + # standardize image or video into (batch, seq, dimension) + + if should_transpose: + x = rearrange(x, 'b d ... -> b ... 
d') + x, ps = pack_one(x, 'b * d') # x.shape [b, hwt, c] + + assert x.shape[-1] == self.dim, f'expected dimension of {self.dim} but received {x.shape[-1]}' + + # split out number of codebooks + + x = rearrange(x, 'b n (c d) -> b n c d', c = self.num_codebooks) + + if self.use_bernoulli: + prob_x = torch.sigmoid(x) + + x = l2norm(x) + + # whether to force quantization step to be full precision or not + + force_f32 = self.force_quantization_f32 + + quantization_context = partial(autocast, 'cuda', enabled = False) if force_f32 else nullcontext + + with quantization_context(): + + if force_f32: + orig_dtype = x.dtype + x = x.float() + + # use straight-through gradients + if self.use_rot_trick: + x_f = x.flatten(end_dim=-2) # (b, hwt, 1, d) -> (bhwt, d) + q_f = self.rot_quantize(x_f, inference= not self.training) + quantized = q_f.reshape(x.shape) + elif self.use_bernoulli: + quantized = self.quantize_new_bernoulli(x, prob_x) + else: + quantized = self.quantize(x) + + # calculate indices + indices = reduce((quantized > 0).int() * self.mask.int(), 'b n c d -> b n c', 'sum') + bit_indices = (quantized > 0).int() + + # entropy aux loss + if self.training: + persample_entropy, cb_entropy, avg_prob = self.soft_entropy_loss(x) # compute entropy + entropy_penalty = self.gamma0 * persample_entropy - self.gamma * cb_entropy + else: + # if not training, just return dummy 0 + entropy_penalty = persample_entropy = cb_entropy = self.zero + + # commit loss + + if self.training and self.commitment_loss_weight > 0.: + + commit_loss = F.mse_loss(x, quantized.detach(), reduction = 'none') + + if exists(mask): + commit_loss = commit_loss[mask] + + commit_loss = commit_loss.mean() + else: + commit_loss = self.zero + + # input back to original dtype if needed + + if force_f32: + x = x.type(orig_dtype) + + # merge back codebook dim + x = quantized # rename quantized to x for output + + if self.use_out_phi_res: + x = x + self.out_phi_scale * self.out_phi(x) # apply out_phi on quant output as residual + else: + x = self.out_phi(x) # apply out_phi on quant output + + x = rearrange(x, 'b n c d -> b n (c d)') + + # reconstitute image or video dimensions + + if should_transpose: + x = unpack_one(x, ps, 'b * d') + x = rearrange(x, 'b ... d -> b d ...') + + bit_indices = unpack_one(bit_indices, ps, 'b * c d') + + # whether to remove single codebook dim + + if not self.keep_num_codebooks_dim: + bit_indices = rearrange(bit_indices, '... 1 d -> ... 
d') + + # complete aux loss + + aux_loss = commit_loss * self.commitment_loss_weight + (self.zeta * entropy_penalty / self.inv_temperature)*entropy_weight + # returns + + ret = Return(x, indices, bit_indices, aux_loss) + + if not return_loss_breakdown: + return ret + + return ret, LossBreakdown(persample_entropy, cb_entropy, commit_loss) + diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/dynamic_resolution.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/dynamic_resolution.py new file mode 100644 index 0000000000000000000000000000000000000000..a35623c3412cff7c2c9c25cc55a7c94314dcb16b --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/dynamic_resolution.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import json +import numpy as np +import tqdm + +vae_stride = 16 +ratio2hws = { + 1.000: [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16),(20,20),(24,24),(32,32),(40,40),(48,48),(64,64)], + 1.250: [(1,1),(2,2),(3,3),(5,4),(10,8),(15,12),(20,16),(25,20),(30,24),(35,28),(45,36),(55,44),(70,56)], + 1.333: [(1,1),(2,2),(4,3),(8,6),(12,9),(16,12),(20,15),(24,18),(28,21),(36,27),(48,36),(60,45),(72,54)], + 1.500: [(1,1),(2,2),(3,2),(6,4),(9,6),(15,10),(21,14),(27,18),(33,22),(39,26),(48,32),(63,42),(78,52)], + 1.750: [(1,1),(2,2),(3,3),(7,4),(11,6),(14,8),(21,12),(28,16),(35,20),(42,24),(56,32),(70,40),(84,48)], + 2.000: [(1,1),(2,2),(4,2),(6,3),(10,5),(16,8),(22,11),(30,15),(38,19),(46,23),(60,30),(74,37),(90,45)], + 2.500: [(1,1),(2,2),(5,2),(10,4),(15,6),(20,8),(25,10),(30,12),(40,16),(50,20),(65,26),(80,32),(100,40)], + 3.000: [(1,1),(2,2),(6,2),(9,3),(15,5),(21,7),(27,9),(36,12),(45,15),(54,18),(72,24),(90,30),(111,37)], +} +full_ratio2hws = {} +for ratio, hws in ratio2hws.items(): + full_ratio2hws[ratio] = hws + full_ratio2hws[int(1/ratio*1000)/1000] = [(item[1], item[0]) for item in hws] + +dynamic_resolution_h_w = {} +predefined_HW_Scales_dynamic = {} +for ratio in full_ratio2hws: + dynamic_resolution_h_w[ratio] ={} + for ind, leng in enumerate([7, 10, 13]): + h, w = full_ratio2hws[ratio][leng-1][0], full_ratio2hws[ratio][leng-1][1] # feature map size + pixel = (h * vae_stride, w * vae_stride) # The original image (H, W) + dynamic_resolution_h_w[ratio][pixel[1]] = { + 'pixel': pixel, + 'scales': full_ratio2hws[ratio][:leng] + } # W as key + predefined_HW_Scales_dynamic[(h, w)] = full_ratio2hws[ratio][:leng] \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/finite_scalar_quantization.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/finite_scalar_quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..949a29a9f269b1886d1742473a4ee6f763f0b09e --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/finite_scalar_quantization.py @@ -0,0 +1,198 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +""" +Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505 +Code adapted from Jax version in Appendix A.1 +""" + +from typing import List, Optional + +import torch +import torch.nn as nn +from torch.nn import Module +from torch import Tensor, int32 +from torch.cuda.amp import autocast + +from einops import rearrange, pack, unpack + +# helper functions + +def exists(v): + return v is not None + +def default(*args): + for arg in args: + if exists(arg): + return arg + return None + +def 
pack_one(t, pattern): + return pack([t], pattern) + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + +# tensor helpers + +def round_ste(z: Tensor) -> Tensor: + """Round with straight through gradients.""" + zhat = z.round() + return z + (zhat - z).detach() + +# main class + +class FSQ(Module): + def __init__( + self, + num_lvl: int, + # levels: List[int], + dim: Optional[int] = None, + num_codebooks = 1, + keep_num_codebooks_dim: Optional[bool] = None, + scale: Optional[float] = None + ): + super().__init__() + levels = [num_lvl] * dim + _levels = torch.tensor(levels, dtype=int32) + self.register_buffer("_levels", _levels, persistent = False) + + # _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32) + # self.register_buffer("_basis", _basis, persistent = False) + + self.scale = scale + + codebook_dim = len(levels) + self.codebook_dim = codebook_dim + + effective_codebook_dim = codebook_dim * num_codebooks + self.num_codebooks = num_codebooks + self.effective_codebook_dim = effective_codebook_dim + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + self.dim = default(dim, len(_levels) * num_codebooks) + + has_projections = self.dim != effective_codebook_dim + self.project_in = nn.Linear(self.dim, effective_codebook_dim) if has_projections else nn.Identity() + self.project_out = nn.Linear(effective_codebook_dim, self.dim) if has_projections else nn.Identity() + self.has_projections = has_projections + + # self.codebook_size = self._levels.prod().item() + + # implicit_codebook = self.indices_to_codes(torch.arange(self.codebook_size), project_out = False) + # self.register_buffer("implicit_codebook", implicit_codebook, persistent = False) + + def bound(self, z: Tensor, eps: float = 1e-3) -> Tensor: + """Bound `z`, an array of shape (..., d).""" + half_l = (self._levels - 1) * (1 - eps) / 2 + offset = torch.where(self._levels % 2 == 0, 0.5, 0.0) + shift = (offset / half_l).tan() + return (z + shift).tanh() * half_l - offset + + def quantize(self, z: Tensor) -> Tensor: + """Quantizes z, returns quantized zhat, same shape as z.""" + quantized = round_ste(self.bound(z)) # -2, -1, 0, 1, 2 for L=5 + half_width = self._levels // 2 # Renormalize to [-1, 1]. half_width = 2 for L=5 + return quantized / half_width + + def _scale_and_shift(self, zhat_normalized: Tensor) -> Tensor: + half_width = self._levels // 2 + return (zhat_normalized * half_width) + half_width + + def _scale_and_shift_inverse(self, zhat: Tensor) -> Tensor: + half_width = self._levels // 2 + return (zhat - half_width) / half_width + + def codes_to_indices(self, zhat: Tensor) -> Tensor: + """Converts a `code` to an index in the codebook.""" + assert zhat.shape[-1] == self.codebook_dim + zhat = self._scale_and_shift(zhat) # {-1, -1/2, 0, 1/2, 1} -> {-2, -1, 0, 1, 2} -> {0, 1, 2, 3, 4} + # return (zhat * self._basis).sum(dim=-1).to(int32) + return zhat.to(int32) + + def indices_to_codes( + self, + indices: Tensor, + project_out = True, + **kwargs, + ) -> Tensor: + """Inverse of `codes_to_indices`.""" + + # is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + + # if is_img_or_video: + # indices = rearrange(indices, 'b d ... -> b ... d') + + # indices = rearrange(indices, '... -> ... 
1') + # codes_non_centered = (indices // self._basis) % self._levels + codes = self._scale_and_shift_inverse(indices) # {0, 1, 2, 3, 4} -> {-1, -1/2, 0, 1/2, 1} + + # if self.keep_num_codebooks_dim: + # codes = rearrange(codes, '... c d -> ... (c d)') + + if project_out: + codes = self.project_out(codes) + + # if is_img_or_video: + codes = rearrange(codes, 'b ... d -> b d ...') + + return codes + + @autocast(enabled = False) + def forward(self, z: Tensor) -> Tensor: + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension + c - number of codebook dim + """ + + is_img_or_video = z.ndim >= 4 + + # standardize image or video into (batch, seq, dimension) + + if is_img_or_video: + z = rearrange(z, 'b d ... -> b ... d') # (b, c, t, h, w) -> (b, t, h, w, c) + # z, ps = pack_one(z, 'b * d') # (b, thw, c), (t, h, w) + + assert z.shape[-1] == self.dim, f'expected dimension of {self.dim} but found dimension of {z.shape[-1]}' + + z = self.project_in(z) + + # z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks) # (b, thw, 1, c) + + codes = self.quantize(z) + indices = self.codes_to_indices(codes) + + # codes = rearrange(codes, 'b n c d -> b n (c d)') + + out = self.project_out(codes) + + # reconstitute image or video dimensions + + if is_img_or_video: + # out = unpack_one(out, ps, 'b * d') + out = rearrange(out, 'b ... d -> b d ...') + # indices = rearrange(indices, 'b ... d -> b d ...') + + + # # indices = unpack_one(indices, ps, 'b * c') + + # if not self.keep_num_codebooks_dim: + # # indices = rearrange(indices, '... 1 -> ...') + # pass + + return out, None, indices, None + + +if __name__ == "__main__": + num_lvl = 5 + dim = 16 + T, H, W = 21, 32, 32 + quantizer = FSQ(num_lvl, dim) + z = torch.randn(2, dim, T, H, W) + out, indices = quantizer(z) \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/lookup_free_quantization.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/lookup_free_quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..b0af6da47fc46093843b8b5993a0ec00937d8734 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/lookup_free_quantization.py @@ -0,0 +1,304 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +""" +Lookup Free Quantization +Proposed in https://arxiv.org/abs/2310.05737 + +In the simplest setup, each dimension is quantized into {-1, 1}. +An entropy penalty is used to encourage utilization. 
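+
+Illustrative example (assuming codebook_dim = 3, i.e. codebook_size = 8): the code
+[+1, -1, +1] has sign bits [1, 0, 1]; with the binary mask [4, 2, 1] used below it
+maps to index 4*1 + 2*0 + 1*1 = 5, and indices_to_codes() inverts this mapping.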
+""" + +from math import log2, ceil +from collections import namedtuple + +import torch +from torch import nn, einsum +import torch.nn.functional as F +from torch.nn import Module +from torch.cuda.amp import autocast + +from einops import rearrange, reduce, pack, unpack + +# constants + +Return = namedtuple('Return', ['quantized', 'indices', 'entropy_aux_loss']) + +LossBreakdown = namedtuple('LossBreakdown', ['per_sample_entropy', 'batch_entropy', 'commitment']) + +# helper functions + +def exists(v): + return v is not None + +def default(*args): + for arg in args: + if exists(arg): + return arg() if callable(arg) else arg + return None + +def pack_one(t, pattern): + return pack([t], pattern) + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + +# entropy + +def log(t, eps = 1e-5): + return t.clamp(min = eps).log() + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + +# class + +class LFQ(Module): + def __init__( + self, + *, + dim = None, + codebook_size = None, + entropy_loss_weight = 0.1, + commitment_loss_weight = 0.25, + diversity_gamma = 1., + straight_through_activation = nn.Identity(), + num_codebooks = 1, + keep_num_codebooks_dim = None, + codebook_scale = 1., # for residual LFQ, codebook scaled down by 2x at each layer + frac_per_sample_entropy = 1. # make less than 1. to only use a random fraction of the probs for per sample entropy + ): + super().__init__() + + # some assert validations + + assert exists(dim) or exists(codebook_size), 'either dim or codebook_size must be specified for LFQ' + assert not exists(codebook_size) or log2(codebook_size).is_integer(), f'your codebook size must be a power of 2 for lookup free quantization (suggested {2 ** ceil(log2(codebook_size))})' + + codebook_size = default(codebook_size, lambda: 2 ** dim) + codebook_dim = int(log2(codebook_size)) + + codebook_dims = codebook_dim * num_codebooks + dim = default(dim, codebook_dims) + + has_projections = dim != codebook_dims + self.project_in = nn.Linear(dim, codebook_dims) if has_projections else nn.Identity() + self.project_out = nn.Linear(codebook_dims, dim) if has_projections else nn.Identity() + self.has_projections = has_projections + + self.dim = dim + self.codebook_dim = codebook_dim + self.num_codebooks = num_codebooks + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + # straight through activation + + self.activation = straight_through_activation + + # entropy aux loss related weights + + assert 0 < frac_per_sample_entropy <= 1. 
+ self.frac_per_sample_entropy = frac_per_sample_entropy + + self.diversity_gamma = diversity_gamma + self.entropy_loss_weight = entropy_loss_weight + + # codebook scale + + self.codebook_scale = codebook_scale + + # commitment loss + + self.commitment_loss_weight = commitment_loss_weight + + # for no auxiliary loss, during inference + + self.register_buffer('mask', 2 ** torch.arange(codebook_dim - 1, -1, -1)) + self.register_buffer('zero', torch.tensor(0.), persistent = False) + + # codes + + all_codes = torch.arange(codebook_size) + bits = ((all_codes[..., None].int() & self.mask) != 0).float() + codebook = self.bits_to_codes(bits) + + self.register_buffer('codebook', codebook, persistent = False) + + def bits_to_codes(self, bits): + return bits * self.codebook_scale * 2 - self.codebook_scale + + @property + def dtype(self): + return self.codebook.dtype + + def indices_to_codes( + self, + indices, + project_out = True + ): + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + + if not self.keep_num_codebooks_dim: + indices = rearrange(indices, '... -> ... 1') + + # indices to codes, which are bits of either -1 or 1 + + bits = ((indices[..., None].int() & self.mask) != 0).to(self.dtype) + + codes = self.bits_to_codes(bits) + + codes = rearrange(codes, '... c d -> ... (c d)') + + # whether to project codes out to original dimensions + # if the input feature dimensions were not log2(codebook size) + + if project_out: + codes = self.project_out(codes) + + # rearrange codes back to original shape + + if is_img_or_video: + codes = rearrange(codes, 'b ... d -> b d ...') + + return codes + + @autocast(enabled = False) + def forward( + self, + x, + inv_temperature = 100., + mask = None, + ): + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension, which is also log2(codebook size) + c - number of codebook dim + """ + + x = x.float() + + is_img_or_video = x.ndim >= 4 + + # standardize image or video into (batch, seq, dimension) + + if is_img_or_video: + x = rearrange(x, 'b d ... -> b ... d') + x, ps = pack_one(x, 'b * d') + + assert x.shape[-1] == self.dim, f'expected dimension of {self.dim} but received {x.shape[-1]}' + + x = self.project_in(x) + + # split out number of codebooks + + x = rearrange(x, 'b n (c d) -> b n c d', c = self.num_codebooks) + + # quantize by eq 3. + + original_input = x + + codebook_value = torch.ones_like(x) * self.codebook_scale + quantized = torch.where(x > 0, codebook_value, -codebook_value) + + # use straight-through gradients (optionally with custom activation fn) if training + + if self.training: + x = self.activation(x) + x = x + (quantized - x).detach() + else: + x = quantized + + # calculate indices + + indices = reduce((x > 0).int() * self.mask.int(), 'b n c d -> b n c', 'sum') + + # entropy aux loss + + if self.training: + # the same as euclidean distance up to a constant + distance = -2 * einsum('... i d, j d -> ... i j', original_input, self.codebook) + + prob = (-distance * inv_temperature).softmax(dim = -1) + + # account for mask + + if exists(mask): + prob = prob[mask] + else: + prob = rearrange(prob, 'b n ... 
-> (b n) ...') + + # whether to only use a fraction of probs, for reducing memory + + if self.frac_per_sample_entropy < 1.: + num_tokens = prob.shape[0] + num_sampled_tokens = int(num_tokens * self.frac_per_sample_entropy) + rand_mask = torch.randn(num_tokens).argsort(dim = -1) < num_sampled_tokens + per_sample_probs = prob[rand_mask] + else: + per_sample_probs = prob + + # calculate per sample entropy + + per_sample_entropy = entropy(per_sample_probs).mean() + + # distribution over all available tokens in the batch + + avg_prob = reduce(per_sample_probs, '... c d -> c d', 'mean') + codebook_entropy = entropy(avg_prob).mean() + + # 1. entropy will be nudged to be low for each code, to encourage the network to output confident predictions + # 2. codebook entropy will be nudged to be high, to encourage all codes to be uniformly used within the batch + + entropy_aux_loss = per_sample_entropy - self.diversity_gamma * codebook_entropy + else: + # if not training, just return dummy 0 + entropy_aux_loss = per_sample_entropy = codebook_entropy = self.zero + + # commit loss + + if self.training: + commit_loss = F.mse_loss(original_input, quantized.detach(), reduction = 'none') + + if exists(mask): + commit_loss = commit_loss[mask] + + commit_loss = commit_loss.mean() + else: + commit_loss = self.zero + + # merge back codebook dim + + x = rearrange(x, 'b n c d -> b n (c d)') + + # project out to feature dimension if needed + + x = self.project_out(x) + + # reconstitute image or video dimensions + + if is_img_or_video: + x = unpack_one(x, ps, 'b * d') + x = rearrange(x, 'b ... d -> b d ...') + + indices = unpack_one(indices, ps, 'b * c') + + # whether to remove single codebook dim + + if not self.keep_num_codebooks_dim: + indices = rearrange(indices, '... 1 -> ...') + + # complete aux loss + aux_loss = entropy_aux_loss * self.entropy_loss_weight + commit_loss * self.commitment_loss_weight + + # ret = Return(x, indices, aux_loss) + # return ret + # return ret, LossBreakdown(per_sample_entropy, codebook_entropy, commit_loss) + + return dict(embeddings=x, encodings=indices, commitment_loss=aux_loss) + + diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq.py new file mode 100644 index 0000000000000000000000000000000000000000..ea65113563e638c320485dc1a931ec3a4d48ccaf --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq.py @@ -0,0 +1,727 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +""" +Binary Spherical Quantization +Proposed in https://arxiv.org/abs/2406.07548 + +In the simplest setup, each dimension is quantized into {-1, 1}. +An entropy penalty is used to encourage utilization. 
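+
+This file wraps BSQ in a multi-scale residual scheme (summarised here as an
+illustrative sketch of the loop below): at every (t, h, w) in the scale schedule the
+running residual is downsampled to that size ('area'), quantized with BSQ, upsampled
+back to the full latent size ('trilinear'), and subtracted from the residual, so the
+accumulated quantized output is refined coarse-to-fine.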
+""" + +import random +from math import log2, ceil +from functools import partial, cache +from collections import namedtuple +from contextlib import nullcontext + +import torch.distributed as dist +from torch.distributed import nn as dist_nn + +import torch +from torch import nn, einsum +import torch.nn.functional as F +from torch.nn import Module +from torch.amp import autocast +import numpy as np + +from einops import rearrange, reduce, pack, unpack + +# from einx import get_at + +from .dynamic_resolution import predefined_HW_Scales_dynamic + +# constants + +Return = namedtuple('Return', ['quantized', 'indices', 'bit_indices', 'entropy_aux_loss']) + +LossBreakdown = namedtuple('LossBreakdown', ['per_sample_entropy', 'batch_entropy', 'commitment']) + +# distributed helpers + +@cache +def is_distributed(): + return dist.is_initialized() and dist.get_world_size() > 1 + +def maybe_distributed_mean(t): + if not is_distributed(): + return t + + dist_nn.all_reduce(t) + t = t / dist.get_world_size() + return t + +# helper functions + +def exists(v): + return v is not None + +def identity(t): + return t + +def default(*args): + for arg in args: + if exists(arg): + return arg() if callable(arg) else arg + return None + +def round_up_multiple(num, mult): + return ceil(num / mult) * mult + +def pack_one(t, pattern): + return pack([t], pattern) + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + +def l2norm(t): + return F.normalize(t, dim = -1) + +# entropy + +def log(t, eps = 1e-5): + return t.clamp(min = eps).log() + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + +# cosine sim linear + +class CosineSimLinear(Module): + def __init__( + self, + dim_in, + dim_out, + scale = 1. + ): + super().__init__() + self.scale = scale + self.weight = nn.Parameter(torch.randn(dim_in, dim_out)) + + def forward(self, x): + x = F.normalize(x, dim = -1) + w = F.normalize(self.weight, dim = 0) + return (x @ w) * self.scale + + +def get_latent2scale_schedule(T: int, H: int, W: int, mode="original"): + assert mode in ["original", "dynamic", "dense", "same1", "same2", "same3"] + predefined_HW_Scales = { + # 256 * 256 + (32, 32): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 6), (9, 9), (13, 13), (18, 18), (24, 24), (32, 32)], + (16, 16): [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (8, 8), (10, 10), (13, 13), (16, 16)], + # 1024x1024 + (64, 64): [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (7, 7), (9, 9), (12, 12), (16, 16), (21, 21), (27, 27), (36, 36), (48, 48), (64, 64)], + + (36, 64): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 6), (9, 12), (13, 16), (18, 24), (24, 32), (32, 48), (36, 64)], + } + if mode == "dynamic": + predefined_HW_Scales.update(predefined_HW_Scales_dynamic) + elif mode == "dense": + predefined_HW_Scales[(16, 16)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(32, 32)] = predefined_HW_Scales[(16, 16)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + elif mode.startswith("same"): + num_quant = int(mode[len("same"):]) + predefined_HW_Scales[(16, 16)] = [(16, 16) for _ in range(num_quant)] + predefined_HW_Scales[(32, 32)] = [(32, 32) for _ in range(num_quant)] + predefined_HW_Scales[(64, 64)] = [(64, 64) for _ in range(num_quant)] + + predefined_T_Scales = [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 15, 17, 17, 17, 17, 17] + patch_THW_shape_per_scale = predefined_HW_Scales[(H, W)] + if len(predefined_T_Scales) < len(patch_THW_shape_per_scale): + # print("warning: the length of 
predefined_T_Scales is less than the length of patch_THW_shape_per_scale!") + predefined_T_Scales += [predefined_T_Scales[-1]] * (len(patch_THW_shape_per_scale) - len(predefined_T_Scales)) + patch_THW_shape_per_scale = [(min(T, t), h, w ) for (h, w), t in zip(patch_THW_shape_per_scale, predefined_T_Scales[:len(patch_THW_shape_per_scale)])] + return patch_THW_shape_per_scale + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + normalized_shape: int + """ + def __init__(self, normalized_shape, norm_weight=False, eps=1e-6, data_format="channels_first"): + super().__init__() + if norm_weight: + self.weight = nn.Parameter(torch.ones(normalized_shape)/(normalized_shape**0.5)) + else: + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + if x.ndim == 4: # (b, c, h, w) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + elif x.ndim == 5: # (b, c, t, h, w) + x = self.weight[:, None, None, None] * x + self.bias[:, None, None, None] + else: + raise ValueError("the number of dimensions of the input should be 4 or 5") + return x + +class MultiScaleBSQ(Module): + """ Follows Algorithm 1. 
in https://arxiv.org/pdf/2107.03312.pdf """ + + def __init__( + self, + *, + dim, + codebook_size, + soft_clamp_input_value = None, + aux_loss = False, # intermediate auxiliary loss + ln_before_quant=False, # add a LN before multi-scale RQ + ln_init_by_sqrt=False, # weight init by 1/sqrt(d) + use_decay_factor=False, + use_stochastic_depth=False, + drop_rate=0., + schedule_mode="original", # ["original", "dynamic", "dense"] + keep_first_quant=False, + keep_last_quant=False, + remove_residual_detach=False, + random_flip = False, + flip_prob = 0.5, + flip_mode = "stochastic", # "stochastic", "deterministic" + max_flip_lvl = 1, + random_flip_1lvl = False, # random flip one level each time + flip_lvl_idx = None, + drop_when_test=False, + drop_lvl_idx=None, + drop_lvl_num=0, + casual_multi_scale=False, + **kwargs + ): + super().__init__() + codebook_dim = int(log2(codebook_size)) + + requires_projection = codebook_dim != dim + self.project_in = nn.Linear(dim, codebook_dim) if requires_projection else nn.Identity() + self.project_out = nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity() + self.has_projections = requires_projection + self.layernorm = LayerNorm(codebook_dim, norm_weight=ln_init_by_sqrt) if ln_before_quant else nn.Identity() + self.use_stochastic_depth = use_stochastic_depth + self.drop_rate = drop_rate + self.remove_residual_detach = remove_residual_detach + self.random_flip = random_flip + self.flip_prob = flip_prob + self.flip_mode = flip_mode + self.max_flip_lvl = max_flip_lvl + self.random_flip_1lvl = random_flip_1lvl + self.flip_lvl_idx = flip_lvl_idx + assert (random_flip and random_flip_1lvl) == False + self.drop_when_test = drop_when_test + self.drop_lvl_idx = drop_lvl_idx + self.drop_lvl_num = drop_lvl_num + self.casual_multi_scale = casual_multi_scale + print(f"{casual_multi_scale=}") + if self.drop_when_test: + assert drop_lvl_idx is not None + assert drop_lvl_num > 0 + + self.lfq = BSQ( + dim = codebook_dim, + codebook_scale = 1/np.sqrt(codebook_dim), + soft_clamp_input_value = soft_clamp_input_value, + # experimental_softplus_entropy_loss=True, + # entropy_loss_offset=2, + **kwargs + ) + + self.z_interplote_up = 'trilinear' + self.z_interplote_down = 'area' + + self.use_decay_factor = use_decay_factor + self.schedule_mode = schedule_mode + self.keep_first_quant = keep_first_quant + self.keep_last_quant = keep_last_quant + if self.use_stochastic_depth and self.drop_rate > 0: + assert self.keep_first_quant or self.keep_last_quant + + @property + def codebooks(self): + return self.lfq.codebook + + def get_codes_from_indices(self, indices_list): + all_codes = [] + for indices in indices_list: + codes = self.lfq.indices_to_codes(indices) + all_codes.append(codes) + _, _, T, H, W = all_codes[-1].size() + summed_codes = 0 + for code in all_codes: + summed_codes += F.interpolate(code, size=(T, H, W), mode=self.z_interplote_up) + return summed_codes + + def get_output_from_indices(self, indices): + codes = self.get_codes_from_indices(indices) + codes_summed = reduce(codes, 'q ... 
-> ...', 'sum') + return self.project_out(codes_summed) + + def flip_quant(self, x): + assert self.flip_mode == 'stochastic' + flip_mask = torch.rand_like(x) < self.flip_prob + x = x.clone() + x[flip_mask] = -x[flip_mask] + return x + + def forward( + self, + x, + scale_schedule=None, + mask = None, + return_all_codes = False, + return_residual_norm_per_scale = False + ): + if x.ndim == 4: + x = x.unsqueeze(2) + B, C, T, H, W = x.size() + + if scale_schedule is None: + if self.schedule_mode.startswith("same"): + scale_num = int(self.schedule_mode[len("same"):]) + assert T == 1 + scale_schedule = [(1, H, W)] * scale_num + else: + scale_schedule = get_latent2scale_schedule(T, H, W, mode=self.schedule_mode) + scale_num = len(scale_schedule) + + # x = self.project_in(x) + x = x.permute(0, 2, 3, 4, 1).contiguous() # (b, c, t, h, w) => (b, t, h, w, c) + x = self.project_in(x) + x = x.permute(0, 4, 1, 2, 3).contiguous() # (b, t, h, w, c) => (b, c, t, h, w) + x = self.layernorm(x) + + quantized_out = 0. + residual = x + + all_losses = [] + all_indices = [] + all_bit_indices = [] + var_inputs = [] + residual_norm_per_scale = [] + + # go through the layers + out_fact = init_out_fact = 1.0 + # residual_list = [] + # interpolate_residual_list = [] + # quantized_list = [] + if self.drop_when_test: + drop_lvl_start = self.drop_lvl_idx + drop_lvl_end = self.drop_lvl_idx + self.drop_lvl_num + scale_num = len(scale_schedule) + with autocast('cuda', enabled = False): + for si, (pt, ph, pw) in enumerate(scale_schedule): + out_fact = max(0.1, out_fact) if self.use_decay_factor else init_out_fact + if self.casual_multi_scale and (pt, ph, pw) != (T, H, W): + interpolate_residual = F.interpolate(residual[:, :, :pt, :, :], size=(pt, ph, pw), mode=self.z_interplote_down) + elif (pt, ph, pw) != (T, H, W): + interpolate_residual = F.interpolate(residual, size=(pt, ph, pw), mode=self.z_interplote_down) + else: + interpolate_residual = residual + if return_residual_norm_per_scale: + residual_norm_per_scale.append((torch.abs(interpolate_residual) < 0.05 * self.lfq.codebook_scale).sum() / interpolate_residual.numel()) + # residual_list.append(torch.norm(residual.detach(), dim=1).mean()) + # interpolate_residual_list.append(torch.norm(interpolate_residual.detach(), dim=1).mean()) + if self.training and self.use_stochastic_depth and random.random() < self.drop_rate: + if (si == 0 and self.keep_first_quant) or (si == scale_num - 1 and self.keep_last_quant): + quantized, indices, _, loss = self.lfq(interpolate_residual) + quantized = quantized * out_fact + all_indices.append(indices) + all_losses.append(loss) + else: + quantized = torch.zeros_like(interpolate_residual) + elif self.drop_when_test and drop_lvl_start <= si < drop_lvl_end: + continue + else: + # residual_norm = torch.norm(interpolate_residual.detach(), dim=1) # (b, t, h, w) + # print(si, residual_norm.min(), residual_norm.max(), residual_norm.mean()) + quantized, indices, bit_indices, loss = self.lfq(interpolate_residual) + if self.random_flip and si < self.max_flip_lvl: + quantized = self.flip_quant(quantized) + if self.random_flip_1lvl and si == self.flip_lvl_idx: + quantized = self.flip_quant(quantized) + quantized = quantized * out_fact + all_indices.append(indices) + # quantized_list.append(torch.norm(quantized.detach(), dim=1).mean()) + if (pt, ph, pw) != (T, H, W): + quantized = F.interpolate(quantized, size=(T, H, W), mode=self.z_interplote_up).contiguous() + + if self.remove_residual_detach: + residual = residual - quantized + else: + residual = 
residual - quantized.detach() + quantized_out = quantized_out + quantized + + all_bit_indices.append(bit_indices) + all_losses.append(loss) + if si != scale_num - 1: + var_inputs.append(F.interpolate(quantized_out, size=scale_schedule[si+1], mode=self.z_interplote_down).contiguous()) + + if self.use_decay_factor: + out_fact -= 0.1 + # print("residual_list:", residual_list) + # print("interpolate_residual_list:", interpolate_residual_list) + # print("quantized_list:", quantized_list) + # import ipdb; ipdb.set_trace() + # project out, if needed + quantized_out = quantized_out.permute(0, 2, 3, 4, 1).contiguous() # (b, c, t, h, w) => (b, t, h, w, c) + quantized_out = self.project_out(quantized_out) + quantized_out = quantized_out.permute(0, 4, 1, 2, 3).contiguous() # (b, t, h, w, c) => (b, c, t, h, w) + + # image + if quantized_out.size(2) == 1: + quantized_out = quantized_out.squeeze(2) + + # stack all losses and indices + + all_losses = torch.stack(all_losses, dim = -1) + + ret = (quantized_out, all_indices, all_bit_indices, residual_norm_per_scale, all_losses, var_inputs) + + if not return_all_codes: + return ret + + # whether to return all codes from all codebooks across layers + all_codes = self.get_codes_from_indices(all_indices) + + # will return all codes in shape (quantizer, batch, sequence length, codebook dimension) + + return (*ret, all_codes) + + +class BSQ(Module): + def __init__( + self, + *, + dim = None, + codebook_size = None, + entropy_loss_weight = 0.1, + commitment_loss_weight = 0.25, + diversity_gamma = 1., + straight_through_activation = nn.Identity(), + num_codebooks = 1, + keep_num_codebooks_dim = None, + codebook_scale = 1., # for residual LFQ, codebook scaled down by 2x at each layer + frac_per_sample_entropy = 1., # make less than 1. 
to only use a random fraction of the probs for per sample entropy + has_projections = None, + projection_has_bias = True, + soft_clamp_input_value = None, + cosine_sim_project_in = False, + cosine_sim_project_in_scale = None, + channel_first = None, + experimental_softplus_entropy_loss = False, + entropy_loss_offset = 5., # how much to shift the loss before softplus + spherical = True, # from https://arxiv.org/abs/2406.07548 + force_quantization_f32 = True, # will force the quantization step to be full precision + inv_temperature = 100.0, + gamma0=1.0, gamma=1.0, zeta=1.0, + preserve_norm = False, # whether to preserve the original norm info + new_quant = True, # new quant function, + mask_out = False, # mask the output as 0 in some conditions + use_out_phi = False, # use output phi network + use_out_phi_res = False, # residual out phi + **kwargs, + ): + super().__init__() + + # some assert validations + + assert exists(dim) or exists(codebook_size), 'either dim or codebook_size must be specified for LFQ' + assert not exists(codebook_size) or log2(codebook_size).is_integer(), f'your codebook size must be a power of 2 for lookup free quantization (suggested {2 ** ceil(log2(codebook_size))})' + + codebook_size = default(codebook_size, lambda: 2 ** dim) + self.codebook_size = codebook_size + + codebook_dim = int(log2(codebook_size)) + codebook_dims = codebook_dim * num_codebooks + dim = default(dim, codebook_dims) + self.codebook_dims = codebook_dims + + has_projections = default(has_projections, dim != codebook_dims) + + if cosine_sim_project_in: + cosine_sim_project_in = default(cosine_sim_project_in_scale, codebook_scale) + project_in_klass = partial(CosineSimLinear, scale = cosine_sim_project_in) + else: + project_in_klass = partial(nn.Linear, bias = projection_has_bias) + + self.project_in = project_in_klass(dim, codebook_dims) if has_projections else nn.Identity() # nn.Identity() + self.project_out = nn.Linear(codebook_dims, dim, bias = projection_has_bias) if has_projections else nn.Identity() # nn.Identity() + self.has_projections = has_projections + + self.out_phi = nn.Linear(codebook_dims, codebook_dims) if use_out_phi else nn.Identity() + self.use_out_phi_res = use_out_phi_res + if self.use_out_phi_res: + self.out_phi_scale = nn.Parameter(torch.zeros(codebook_dims), requires_grad=True) # init as zero + + self.dim = dim + self.codebook_dim = codebook_dim + self.num_codebooks = num_codebooks + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + # channel first + + self.channel_first = channel_first + + # straight through activation + + self.activation = straight_through_activation + + # For BSQ (binary spherical quantization) + if not spherical: + raise ValueError("For BSQ, spherical must be True.") + self.persample_entropy_compute = 'analytical' + self.inv_temperature = inv_temperature + self.gamma0 = gamma0 # loss weight for entropy penalty + self.gamma = gamma # loss weight for entropy penalty + self.zeta = zeta # loss weight for entire entropy penalty + self.preserve_norm = preserve_norm + self.new_quant = new_quant + self.mask_out = mask_out + + # entropy aux loss related weights + + assert 0 < frac_per_sample_entropy <= 1. 
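+ # Values below 1. are meant to sub-sample positions for the per-sample
+ # entropy estimate; as written, the analytical path in soft_entropy_loss
+ # uses all positions and this fraction is only stored.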
+ self.frac_per_sample_entropy = frac_per_sample_entropy + + self.diversity_gamma = diversity_gamma + self.entropy_loss_weight = entropy_loss_weight + + # codebook scale + + self.codebook_scale = codebook_scale + + # commitment loss + + self.commitment_loss_weight = commitment_loss_weight + + # whether to soft clamp the input value from -value to value + + self.soft_clamp_input_value = soft_clamp_input_value + assert not exists(soft_clamp_input_value) or soft_clamp_input_value >= codebook_scale + + # whether to make the entropy loss positive through a softplus (experimental, please report if this worked or not in discussions) + + self.entropy_loss_offset = entropy_loss_offset + self.experimental_softplus_entropy_loss = experimental_softplus_entropy_loss + + # for no auxiliary loss, during inference + + self.register_buffer('mask', 2 ** torch.arange(codebook_dim - 1, -1, -1)) + self.register_buffer('zero', torch.tensor(0.), persistent = False) + + # whether to force quantization step to be f32 + + self.force_quantization_f32 = force_quantization_f32 + + # codes + + # all_codes = torch.arange(codebook_size) + # bits = ((all_codes[..., None].int() & self.mask) != 0).float() + # codebook = self.bits_to_codes(bits) + + # self.register_buffer('codebook', codebook.float(), persistent = False) + + def bits_to_codes(self, bits): + return bits * self.codebook_scale * 2 - self.codebook_scale + + # @property + # def dtype(self): + # return self.codebook.dtype + + def indices_to_codes( + self, + indices, + label_type = 'int_label', + project_out = True + ): + assert label_type in ['int_label', 'bit_label'] + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + should_transpose = default(self.channel_first, is_img_or_video) + + if not self.keep_num_codebooks_dim: + if label_type == 'int_label': + indices = rearrange(indices, '... -> ... 1') + else: + indices = indices.unsqueeze(-2) + + # indices to codes, which are bits of either -1 or 1 + + if label_type == 'int_label': + assert indices[..., None].int().min() > 0 + bits = ((indices[..., None].int() & self.mask) != 0).float() # .to(self.dtype) + else: + bits = indices + + codes = self.bits_to_codes(bits) + + codes = l2norm(codes) # must normalize when using BSQ + + codes = rearrange(codes, '... c d -> ... (c d)') + + # whether to project codes out to original dimensions + # if the input feature dimensions were not log2(codebook size) + + if project_out: + codes = self.project_out(codes) + + # rearrange codes back to original shape + + if should_transpose: + codes = rearrange(codes, 'b ... d -> b d ...') + + return codes + + def quantize(self, z): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + + zhat = torch.where(z > 0, + torch.tensor(1, dtype=z.dtype, device=z.device), + torch.tensor(-1, dtype=z.dtype, device=z.device)) + return z + (zhat - z).detach() + + def quantize_new(self, z): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + + zhat = torch.where(z > 0, + torch.tensor(1, dtype=z.dtype, device=z.device), + torch.tensor(-1, dtype=z.dtype, device=z.device)) + + q_scale = 1. 
/ (self.codebook_dims ** 0.5) + zhat = q_scale * zhat # on unit sphere + + return z + (zhat - z).detach() + + def soft_entropy_loss(self, z): + if self.persample_entropy_compute == 'analytical': + # if self.l2_norm: + p = torch.sigmoid(-4 * z / (self.codebook_dims ** 0.5) * self.inv_temperature) + # else: + # p = torch.sigmoid(-4 * z * self.inv_temperature) + prob = torch.stack([p, 1-p], dim=-1) # (b, h, w, 18, 2) + per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() # (b,h,w,18)->(b,h,w)->scalar + else: + per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() + + # macro average of the probability of each subgroup + avg_prob = reduce(prob, '... g d ->g d', 'mean') # (18, 2) + codebook_entropy = self.get_entropy(avg_prob, dim=-1, normalize=False) + + # the approximation of the entropy is the sum of the entropy of each subgroup + return per_sample_entropy, codebook_entropy.sum(), avg_prob + + def get_entropy(self, count, dim=-1, eps=1e-4, normalize=True): + if normalize: # False + probs = (count + eps) / (count + eps).sum(dim=dim, keepdim =True) + else: # True + probs = count + H = -(probs * torch.log(probs + 1e-8)).sum(dim=dim) + return H + + def forward( + self, + x, + return_loss_breakdown = False, + mask = None, + entropy_weight=0.1 + ): + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension, which is also log2(codebook size) + c - number of codebook dim + """ + + is_img_or_video = x.ndim >= 4 + should_transpose = default(self.channel_first, is_img_or_video) + + # standardize image or video into (batch, seq, dimension) + + if should_transpose: + x = rearrange(x, 'b d ... -> b ... d') + x, ps = pack_one(x, 'b * d') # x.shape [b, hwt, c] + + assert x.shape[-1] == self.dim, f'expected dimension of {self.dim} but received {x.shape[-1]}' + + x = self.project_in(x) + + # split out number of codebooks + + x = rearrange(x, 'b n (c d) -> b n c d', c = self.num_codebooks) + + x = l2norm(x) + + # whether to force quantization step to be full precision or not + + force_f32 = self.force_quantization_f32 + + quantization_context = partial(autocast, 'cuda', enabled = False) if force_f32 else nullcontext + + indices = None + with quantization_context(): + + if force_f32: + orig_dtype = x.dtype + x = x.float() + + # use straight-through gradients (optionally with custom activation fn) if training + if self.new_quant: + quantized = self.quantize_new(x) + + # calculate indices + bit_indices = (quantized > 0).int() + entropy_penalty = persample_entropy = cb_entropy = self.zero + commit_loss = self.zero + + # input back to original dtype if needed + + if force_f32: + x = x.type(orig_dtype) + + # merge back codebook dim + x = quantized # rename quantized to x for output + x = rearrange(x, 'b n c d -> b n (c d)') + + # project out to feature dimension if needed + + x = self.project_out(x) + + # reconstitute image or video dimensions + + if should_transpose: + x = unpack_one(x, ps, 'b * d') + x = rearrange(x, 'b ... d -> b d ...') + + bit_indices = unpack_one(bit_indices, ps, 'b * c d') + + # whether to remove single codebook dim + + if not self.keep_num_codebooks_dim: + bit_indices = rearrange(bit_indices, '... 1 d -> ... 
d') + + # complete aux loss + + aux_loss = commit_loss * self.commitment_loss_weight + (self.zeta * entropy_penalty / self.inv_temperature)*entropy_weight + # returns + + ret = Return(x, indices, bit_indices, aux_loss) + + if not return_loss_breakdown: + return ret + + return ret, LossBreakdown(persample_entropy, cb_entropy, commit_loss) + diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq_tp.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq_tp.py new file mode 100644 index 0000000000000000000000000000000000000000..2074d8470394ab42fedce795f6c3c44ecca676e1 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq_tp.py @@ -0,0 +1,841 @@ +""" +Binary Spherical Quantization +Proposed in https://arxiv.org/abs/2406.07548 + +In the simplest setup, each dimension is quantized into {-1, 1}. +An entropy penalty is used to encourage utilization. +""" +import json +import random +import copy +from math import log2, ceil +from functools import partial, cache +from collections import namedtuple +from contextlib import nullcontext + +import torch.distributed as dist +from torch.distributed import nn as dist_nn + +import torch +from torch import nn, einsum +import torch.nn.functional as F +from torch.nn import Module +from torch.amp import autocast +import numpy as np + +from einops import rearrange, reduce, pack, unpack + +# from einx import get_at + +from infinity.models.videovae.utils.dynamic_resolution import predefined_HW_Scales_dynamic +from infinity.models.videovae.utils.dynamic_resolution_two_pyramid import dynamic_resolution_thw, total_pixels2scales +from infinity.models.videovae.modules.quantizer.finite_scalar_quantization import FSQ + +# print(f"{dynamic_resolution_thw=}") + +# constants + +Return = namedtuple('Return', ['quantized', 'indices', 'entropy_aux_loss']) + +LossBreakdown = namedtuple('LossBreakdown', ['per_sample_entropy', 'batch_entropy', 'commitment']) + +# distributed helpers + +@cache +def is_distributed(): + return dist.is_initialized() and dist.get_world_size() > 1 + +def maybe_distributed_mean(t): + if not is_distributed(): + return t + + dist_nn.all_reduce(t) + t = t / dist.get_world_size() + return t + +# helper functions + +def exists(v): + return v is not None + +def identity(t): + return t + +def default(*args): + for arg in args: + if exists(arg): + return arg() if callable(arg) else arg + return None + +def round_up_multiple(num, mult): + return ceil(num / mult) * mult + +def pack_one(t, pattern): + return pack([t], pattern) + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + +def l2norm(t): + return F.normalize(t, dim = -1) + +# entropy + +def log(t, eps = 1e-5): + return t.clamp(min = eps).log() + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + +# cosine sim linear + +class CosineSimLinear(Module): + def __init__( + self, + dim_in, + dim_out, + scale = 1. 
+ ): + super().__init__() + self.scale = scale + self.weight = nn.Parameter(torch.randn(dim_in, dim_out)) + + def forward(self, x): + x = F.normalize(x, dim = -1) + w = F.normalize(self.weight, dim = 0) + return (x @ w) * self.scale + +def repeat_schedule(scale_schedule, repeat_scales_num, times): + new_scale_schedule = [] + for i in range(repeat_scales_num): + new_scale_schedule.extend([scale_schedule[i] for _ in range(times)]) + new_scale_schedule.extend(scale_schedule[repeat_scales_num:]) + return new_scale_schedule + +def get_latent2scale_schedule(T: int, H: int, W: int, mode="original", last_scale_repeat_n=0, args=None): + predefined_HW_Scales = {} + if mode.startswith("infinity_video_two_pyramid"): + if 'elegant' in mode: + base_scale_schedule = copy.deepcopy(dynamic_resolution_thw[(H, W)]['scales']) + image_scale_repetition = json.loads(args.image_scale_repetition) + video_scale_repetition = json.loads(args.video_scale_repetition) + # print(f'{image_scale_repetition=} {video_scale_repetition=}') + base_scale_schedule = copy.deepcopy(dynamic_resolution_thw[(H, W)]['scales']) + def repeat_scales(base_scale_schedule, scale_repetition): + scale_schedule = [] + for i in range(len(base_scale_schedule)): + scale_schedule.extend([base_scale_schedule[i] for _ in range(scale_repetition[i])]) + return scale_schedule + image_scale_schedule = repeat_scales(base_scale_schedule, image_scale_repetition) + spatial_time_schedule = [] + spatial_time_schedule.extend(image_scale_schedule) + firstframe_scalecnt = len(image_scale_schedule) + if T > 1: + scale_schedule = repeat_scales(base_scale_schedule, video_scale_repetition) + spatial_time_schedule.extend([(T-1, h, w) for i, (_, h, w) in enumerate(scale_schedule)]) + # double h and w + tower_split_index = firstframe_scalecnt + # print(f'{spatial_time_schedule=}') + return spatial_time_schedule, tower_split_index + if "motion_boost_v2" in mode: + times = 6 + base_scale_schedule = copy.deepcopy(dynamic_resolution_thw[(H, W)]['scales']) + image_scale_schedule = repeat_schedule(base_scale_schedule, 3, times) + spatial_time_schedule = [] + spatial_time_schedule.extend(image_scale_schedule) + firstframe_scalecnt = len(image_scale_schedule) + if T > 1: + scale_schedule = repeat_schedule(base_scale_schedule, 7, times) + predefined_t = [T - 1 for _ in range(len(scale_schedule))] + spatial_time_schedule.extend([(min(int(np.round(predefined_t[i])), T - 1), h, w) for i, (_, h, w) in enumerate(scale_schedule)]) + # double h and w + spatial_time_schedule_double = [(t, 2*h, 2*w) for (t, h, w) in spatial_time_schedule] + tower_split_index = firstframe_scalecnt + return spatial_time_schedule_double, tower_split_index + spatial_time_schedule = copy.deepcopy(dynamic_resolution_thw[(H, W)]['scales']) + spatial_time_schedule.extend(spatial_time_schedule[-1:] * last_scale_repeat_n) + tower_split_index = dynamic_resolution_thw[(H, W)]['tower_split_index'] + last_scale_repeat_n + if T > 1: + # predefined_t = np.linspace(1, compressed_frames - 1, len(scale_schedule)) + if mode == "infinity_video_two_pyramid_full_time": + spatial_time_schedule.extend([(T - 1, h, w) for i, (_, h, w) in enumerate(spatial_time_schedule)]) + else: + predefined_t = np.linspace(1, T - 1, total_pixels2scales['0.06M']-3).tolist() + [T - 1] * (len(spatial_time_schedule)-total_pixels2scales['0.06M']+3) + spatial_time_schedule.extend([(min(int(np.round(predefined_t[i])), T - 1), h, w) for i, (_, h, w) in enumerate(spatial_time_schedule)]) + spatial_time_schedule.extend(spatial_time_schedule[-1:] * 
last_scale_repeat_n) + # double h and w + spatial_time_schedule_double = [(t, 2*h, 2*w) for (t, h, w) in spatial_time_schedule] + return spatial_time_schedule_double, tower_split_index + if mode == "original": + predefined_HW_Scales = { + # 256x256 + (16, 16): [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (8, 8), (10, 10), (13, 13), (16, 16)], + (36, 64): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 6), (9, 12), (13, 16), (18, 24), (24, 32), (32, 48), (36, 64)], + (18, 32): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 8), (8, 10), (10, 14), (12, 18), (14, 22), (16, 26), (18, 32)], + (30, 53): [(1, 1), (2, 2), (3, 3), (4, 7), (6, 11), (8, 14), (12, 21), (16, 28), (20, 35), (22, 39), (24, 42), (26, 46), (28, 50), (30, 53)] + } + predefined_HW_Scales[(32, 32)] = predefined_HW_Scales[(16, 16)] + [(20, 20), (24, 24), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (64, 64)] + elif mode == "dynamic": + predefined_HW_Scales.update(predefined_HW_Scales_dynamic) + elif mode == "dense": + predefined_HW_Scales[(16, 16)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(32, 32)] = predefined_HW_Scales[(16, 16)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + elif mode == "dense_f8": + # predefined_HW_Scales[(16, 16)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(32, 32)] = [(x, x) for x in range(1, 16+1)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + predefined_HW_Scales[(128, 128)] = predefined_HW_Scales[(64, 64)] + [(80, 80), (96, 96), (112, 112), (128, 128)] + elif mode == "dense_f8_double": + # predefined_HW_Scales setting double from dense f16 + predefined_HW_Scales[(32, 32)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(96, 96)] = predefined_HW_Scales[(64, 64)] + [(40, 40), (48, 48)] + predefined_HW_Scales[(128, 128)] = predefined_HW_Scales[(64, 64)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + + predefined_HW_Scales[(24, 42)] = [(1, 1), (2, 2), (3, 3), (3, 4), (3, 5), (4, 6), (4, 7), (5, 8), (6, 9), (6, 10), (6, 11), (7, 12), (7, 13), (8, 14), (9, 15), (9, 16), (12, 21)] + predefined_HW_Scales[(36, 64)] = predefined_HW_Scales[(24, 42)] + [(14, 26), (18, 32)] + predefined_HW_Scales[(60, 108)] = predefined_HW_Scales[(36, 64)] + [(24, 42), (30, 54)] + predefined_HW_Scales[(90, 160)] = predefined_HW_Scales[(60, 108)] + [(38, 66),(45, 80)] + + for k, v in predefined_HW_Scales.items(): + predefined_HW_Scales[k] = [(2*x, 2*y) for (x, y) in v] + elif mode.startswith("same"): + num_quant = int(mode[len("same"):]) + predefined_HW_Scales[(16, 16)] = [(16, 16) for _ in range(num_quant)] + predefined_HW_Scales[(32, 32)] = [(32, 32) for _ in range(num_quant)] + predefined_HW_Scales[(64, 64)] = [(64, 64) for _ in range(num_quant)] + elif mode == "half": + predefined_HW_Scales[(32, 32)] = [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (8, 8), (10, 10), (13, 13), (16, 16)] + predefined_HW_Scales[(64, 64)] = [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16)] + else: + raise NotImplementedError + + # predefined_T_Scales = [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 17, 17, 17, 17, 17, 17] + # predefined_T_Scales = [1, 2, 3, 4, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27] + predefined_T_Scales = [1, 2, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 
23, 25, 27, 29] + # predefined_T_Scales = [1, 2, 3, 5, 6, 8, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] + patch_THW_shape_per_scale = predefined_HW_Scales[(H, W)] + if len(predefined_T_Scales) < len(patch_THW_shape_per_scale): + # print("warning: the length of predefined_T_Scales is less than the length of patch_THW_shape_per_scale!") + predefined_T_Scales += [predefined_T_Scales[-1]] * (len(patch_THW_shape_per_scale) - len(predefined_T_Scales)) + patch_THW_shape_per_scale = [(min(T, t), h, w ) for (h, w), t in zip(patch_THW_shape_per_scale, predefined_T_Scales[:len(patch_THW_shape_per_scale)])] + return patch_THW_shape_per_scale + +def interpolate(tensor, size, mode): + """ + arguments: + tensor: (B,C,T,H,W) + size: (C1,T,H1,W1) + mode: str + return: + tensor: (B,*size) + """ + C1, T, H1, W1 = size + tensor = tensor.permute(0,2,1,3,4) # (B,C,T,H,W) -> (B,T,C,H,W) + tensor = F.interpolate(tensor, size=(C1, H1, W1), mode=mode) + tensor = tensor.permute(0,2,1,3,4) # (B,T,C1,H1,W1) -> (B,C1,T,H1,W1) + return tensor + +# TP: Two Pyramid +class MultiScaleBSQTP(Module): + """ Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf """ + + def __init__( + self, + *, + dim, + soft_clamp_input_value = None, + aux_loss = False, # intermediate auxiliary loss + use_stochastic_depth=False, + drop_rate=0., + schedule_mode="original", # ["original", "dynamic", "dense"] + keep_first_quant=False, + keep_last_quant=False, + remove_residual_detach=False, + random_flip = False, + flip_prob = 0.5, + flip_mode = "stochastic", # "stochastic", "deterministic" + max_flip_lvl = 1, + random_flip_1lvl = False, # random flip one level each time + flip_lvl_idx = None, + drop_when_test=False, + drop_lvl_idx=None, + drop_lvl_num=0, + random_short_schedule = False, # randomly use short schedule (schedule for images of 256x256) + short_schedule_prob = 0.5, + disable_flip_prob = 0.0, # disable random flip in this image + casual_multi_scale = False, # causal multiscale + temporal_slicing = False, + last_scale_repeat_n = 0, + num_lvl_fsq = None, + other_args = None, + **kwargs + ): + super().__init__() + codebook_dim = dim + self.use_stochastic_depth = use_stochastic_depth + self.drop_rate = drop_rate + self.remove_residual_detach = remove_residual_detach + self.random_flip = random_flip + self.flip_prob = flip_prob + self.flip_mode = flip_mode + self.max_flip_lvl = max_flip_lvl + self.random_flip_1lvl = random_flip_1lvl + self.flip_lvl_idx = flip_lvl_idx + assert (random_flip and random_flip_1lvl) == False + self.disable_flip_prob = disable_flip_prob + self.casual_multi_scale = casual_multi_scale + self.temporal_slicing = temporal_slicing + self.last_scale_repeat_n = last_scale_repeat_n + # print(f"{casual_multi_scale=}") + + self.drop_when_test = drop_when_test + self.drop_lvl_idx = drop_lvl_idx + self.drop_lvl_num = drop_lvl_num + if self.drop_when_test: + assert drop_lvl_idx is not None + assert drop_lvl_num > 0 + self.random_short_schedule = random_short_schedule + self.short_schedule_prob = short_schedule_prob + self.z_interplote_up = 'trilinear' + self.z_interplote_down = 'area' + + self.schedule_mode = schedule_mode + self.keep_first_quant = keep_first_quant + self.keep_last_quant = keep_last_quant + if self.use_stochastic_depth and self.drop_rate > 0: + assert self.keep_first_quant or self.keep_last_quant + + self.full2short = {7:7, 10:7, 13:7, 16:16, 20:16, 24:16} + if self.schedule_mode == 'dense_f8': + self.full2short_f8 = {20:20, 24:24, 28:24} + elif self.schedule_mode == 'dense_f8_double': + 
self.full2short_f8 = {16: 14, 17: 14, 19: 14, 20:14, 21:14, 22:14, 24:14} + elif self.schedule_mode.startswith("infinity_video_two_pyramid"): + self.full2short_f8 = {11: 11, 13: 11, 14: 11, 16: 11, 29: 26, 28: 26, 26: 26} + + self.other_args = other_args + print(f'{self.other_args=}') + self.origin_C = self.other_args.detail_scale_dim + self.detail_scale_dim, self.semantic_scale_dim = self.other_args.detail_scale_dim, self.other_args.semantic_scale_dim + self.semantic_scales = other_args.semantic_scales + + if self.other_args.semantic_num_lvl == 2: + self.lfq_semantic = BSQ( + dim = self.semantic_scale_dim, + codebook_scale = 1, + soft_clamp_input_value = soft_clamp_input_value, + **kwargs, + ) + else: + assert self.other_args.semantic_num_lvl >= 2, f'{self.other_args.semantic_num_lvl=} is not supported' + self.lfq_semantic = FSQ( + num_lvl = self.other_args.semantic_num_lvl, + dim = self.semantic_scale_dim, + ) + if self.other_args.detail_num_lvl == 2: + self.lfq_detail = BSQ( + dim = self.detail_scale_dim, + codebook_scale = 1, + soft_clamp_input_value = soft_clamp_input_value, + **kwargs, + ) + else: + assert self.other_args.detail_num_lvl >= 2, f'{self.other_args.semantic_num_lvl=} is not supported' + self.lfq_detail = FSQ( + num_lvl = self.other_args.detail_num_lvl, + dim = self.detail_scale_dim, + ) + + @property + def codebooks(self): + return self.lfq_detail.codebook + + def get_codes_from_indices(self, indices_list): + all_codes = [] + for indices in indices_list: + # indices: [B,t,h,w,d] + if indices.shape[-1] == self.origin_C: + codes = self.lfq.indices_to_codes(indices) + elif indices.shape[-1] == self.semantic_scale_dim: + codes = self.lfq_semantic.indices_to_codes(indices) + else: + raise NotImplementedError(f'indices shape {indices.shape} not supported') + all_codes.append(codes) + _, _, T, H, W = all_codes[-1].size() + summed_codes = 0 + for code in all_codes: + summed_codes += F.interpolate(code, size=(T, H, W), mode=self.z_interplote_up) + return summed_codes + + def get_output_from_indices(self, indices): + codes = self.get_codes_from_indices(indices) + codes_summed = reduce(codes, 'q ... 
-> ...', 'sum') + return codes_summed + + def flip_quant(self, x): + # assert self.flip_mode in ['stochastic', 'stochastic_dynamic'] + if self.flip_mode == 'stochastic': + flip_mask = torch.rand_like(x) < self.flip_prob + elif self.flip_mode == 'stochastic_dynamic': + flip_prob = random.uniform(0, self.flip_prob) + flip_mask = torch.rand_like(x) < flip_prob + else: + raise NotImplementedError + x = x.clone() + x[flip_mask] = -x[flip_mask] + return x + + def forward( + self, + x_list, + mask = None, + return_all_codes = False, + ): + assert len(x_list) <= 2 + multi_scale = len(x_list) == 2 + for i in range(len(x_list)): + if x_list[i].ndim == 4: + x_list[i] = x_list[i].unsqueeze(2) + B, C, T, H, W = x_list[-1].size() + + if self.schedule_mode.startswith("same"): + scale_num = int(self.schedule_mode[len("same"):]) + assert T == 1 + scale_schedule = [(1, H, W)] * scale_num + elif self.schedule_mode.startswith("infinity_video_two_pyramid"): + scale_schedule, tower_split_index = get_latent2scale_schedule(T, H, W, mode=self.schedule_mode, last_scale_repeat_n=self.last_scale_repeat_n, args=self.other_args) + scale_num = len(scale_schedule) + else: + scale_schedule = get_latent2scale_schedule(T, H, W, mode=self.schedule_mode, args=self.other_args) + scale_num = len(scale_schedule) + + quantized_out = torch.zeros((B, C, 1, 1, 1), device=x_list[-1].device, dtype=x_list[-1].dtype) + quantized_out_firstframe = None + + all_losses = [] + all_indices = [] + + # cal scale_in_one_clip + unique_scale_schedule = [scale_schedule[0]] + scale_in_one_clip = 1 + for si in range(1, len(scale_schedule)): + if np.array(scale_schedule[si]).prod() < np.array(scale_schedule[si-1]).prod(): + break + if scale_schedule[si] != scale_schedule[si-1]: + unique_scale_schedule.append(scale_schedule[si]) + scale_in_one_clip += 1 + + current_scale_in_one_clip = 0 + must_preserve_scales = [] + if self.other_args.quant_not_rely_256: + must_preserve_scales = [11] + with autocast('cuda', enabled = False): + for si, (pt, ph, pw) in enumerate(scale_schedule): + if si > 0 and scale_schedule[si] != scale_schedule[si-1]: + current_scale_in_one_clip += 1 + current_scale_in_one_clip = current_scale_in_one_clip % scale_in_one_clip + + last_step_in_one_scale = False + if si < len(scale_schedule)-1 and (scale_schedule[si] != scale_schedule[si+1]): + last_step_in_one_scale = True + if si == len(scale_schedule)-1: + last_step_in_one_scale = True + + if si < tower_split_index: + ss, ee = 0, 1 + else: + ss, ee = 1, T + + if multi_scale and current_scale_in_one_clip < self.other_args.scales_256: + target = x_list[0][:,:,ss:ee] + else: + target = x_list[-1][:,:,ss:ee] + tgt_shape = target.shape[-4:] + + skip_this_scale = False + if current_scale_in_one_clip < self.semantic_scales: + C1 = self.semantic_scale_dim + lfq = self.lfq_semantic + else: + C1 = self.detail_scale_dim + lfq = self.lfq_detail + if current_scale_in_one_clip not in must_preserve_scales: + skip_this_scale = random.random() < self.other_args.skip_detail_scales_prob + + if not skip_this_scale: + quantized_out = interpolate(quantized_out, size=tgt_shape, mode=self.z_interplote_up) + interpolate_residual = interpolate(target-quantized_out, size=(C1, pt, ph, pw), mode=self.z_interplote_down) + quantized, indices, loss = lfq(interpolate_residual) + quantized = interpolate(quantized, size=tgt_shape, mode=self.z_interplote_up) + all_indices.append(indices) + all_losses.append(loss) + quantized_out = quantized_out + quantized + + if si == tower_split_index - 1: + quantized_out_firstframe = 
quantized_out.clone() + quantized_out = quantized_out * 0. # set to zero + + if multi_scale and si < tower_split_index and last_step_in_one_scale and current_scale_in_one_clip == self.other_args.scales_256-1: + quantized_out_firstframe_256 = quantized_out.clone() + if self.other_args.quant_not_rely_256: + quantized_out = quantized_out * 0. + if multi_scale and si >= tower_split_index and last_step_in_one_scale and current_scale_in_one_clip == self.other_args.scales_256-1: + quantized_out_256 = quantized_out.clone() + if self.other_args.quant_not_rely_256: + quantized_out = quantized_out * 0. + + quantized_out_list = [] + if T == 1: + if multi_scale: + quantized_out_list.append(quantized_out_firstframe_256) + quantized_out_list.append(quantized_out_firstframe) + else: + quantized_out_list.append(quantized_out_firstframe) + else: + if multi_scale: + quantized_out_256 = torch.cat([quantized_out_firstframe_256, quantized_out_256], dim=2) + quantized_out_list.append(quantized_out_256) + quantized_out = torch.cat([quantized_out_firstframe, quantized_out], dim=2) + quantized_out_list.append(quantized_out) + else: + quantized_out = torch.cat([quantized_out_firstframe, quantized_out], dim=2) + quantized_out_list.append(quantized_out) + + all_losses = torch.stack(all_losses, dim = -1) + + ret = (quantized_out_list, all_indices, all_losses) + + if not return_all_codes: + return ret + + # whether to return all codes from all codebooks across layers + all_codes = self.get_codes_from_indices(all_indices) + + # will return all codes in shape (quantizer, batch, sequence length, codebook dimension) + + return (*ret, all_codes) + + +class BSQ(Module): + def __init__( + self, + *, + dim = None, + entropy_loss_weight = 0.1, + commitment_loss_weight = 0.25, + num_codebooks = 1, + keep_num_codebooks_dim = None, + codebook_scale = 1., # for residual LFQ, codebook scaled down by 2x at each layer + frac_per_sample_entropy = 1., # make less than 1. 
to only use a random fraction of the probs for per sample entropy + soft_clamp_input_value = None, + channel_first = None, + experimental_softplus_entropy_loss = False, + entropy_loss_offset = 5., # how much to shift the loss before softplus + spherical = True, # from https://arxiv.org/abs/2406.07548 + force_quantization_f32 = True, # will force the quantization step to be full precision + inv_temperature = 100.0, + gamma0=1.0, gamma=1.0, zeta=1.0, + use_out_phi = False, # use output phi network + use_out_phi_res = False, # residual out phi + use_bernoulli = False, + use_rot_trick = False, + ): + super().__init__() + + # some assert validations + assert exists(dim) , 'dim must be specified for BSQ' + + codebook_dim = dim + codebook_dims = codebook_dim * num_codebooks + dim = default(dim, codebook_dims) + self.codebook_dims = codebook_dims + + self.out_phi = nn.Linear(codebook_dims, codebook_dims) if use_out_phi else nn.Identity() + self.use_out_phi_res = use_out_phi_res + if self.use_out_phi_res: + self.out_phi_scale = nn.Parameter(torch.zeros(codebook_dims), requires_grad=True) # init as zero + + self.dim = dim + self.codebook_dim = codebook_dim + self.num_codebooks = num_codebooks + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + # channel first + self.channel_first = channel_first + + # For BSQ (binary spherical quantization) + if not spherical: + raise ValueError("For BSQ, spherical must be True.") + self.persample_entropy_compute = 'analytical' + self.inv_temperature = inv_temperature + self.gamma0 = gamma0 # loss weight for entropy penalty + self.gamma = gamma # loss weight for entropy penalty + self.zeta = zeta # loss weight for entire entropy penalty + self.use_bernoulli = use_bernoulli + self.use_rot_trick = use_rot_trick + + # entropy aux loss related weights + + assert 0 < frac_per_sample_entropy <= 1. + self.frac_per_sample_entropy = frac_per_sample_entropy + + self.entropy_loss_weight = entropy_loss_weight + + # codebook scale + + self.codebook_scale = codebook_scale + + # commitment loss + + self.commitment_loss_weight = commitment_loss_weight + + # whether to soft clamp the input value from -value to value + + self.soft_clamp_input_value = soft_clamp_input_value + assert not exists(soft_clamp_input_value) or soft_clamp_input_value >= codebook_scale + + # whether to make the entropy loss positive through a softplus (experimental, please report if this worked or not in discussions) + + self.entropy_loss_offset = entropy_loss_offset + self.experimental_softplus_entropy_loss = experimental_softplus_entropy_loss + + # for no auxiliary loss, during inference + + self.register_buffer('mask', 2 ** torch.arange(codebook_dim - 1, -1, -1)) + self.register_buffer('zero', torch.tensor(0.), persistent = False) + + # whether to force quantization step to be f32 + + self.force_quantization_f32 = force_quantization_f32 + + def bits_to_codes(self, bits): + return bits * self.codebook_scale * 2 - self.codebook_scale + + # @property + # def dtype(self): + # return self.codebook.dtype + + def indices_to_codes( + self, + indices, + project_out = True + ): + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + should_transpose = default(self.channel_first, is_img_or_video) + + if not self.keep_num_codebooks_dim: + indices = rearrange(indices, '... -> ... 
1') + + # indices to codes, which are bits of either -1 or 1 + + bits = ((indices[..., None].int() & self.mask) != 0).to(self.dtype) + + codes = self.bits_to_codes(bits) + + codes = l2norm(codes) # must normalize when using BSQ + + codes = rearrange(codes, '... c d -> ... (c d)') + + # whether to project codes out to original dimensions + # if the input feature dimensions were not log2(codebook size) + + # rearrange codes back to original shape + + if should_transpose: + codes = rearrange(codes, 'b ... d -> b d ...') + + return codes + + def quantize(self, z): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + + zhat = torch.where(z > 0, + torch.tensor(1, dtype=z.dtype, device=z.device), + torch.tensor(-1, dtype=z.dtype, device=z.device)) + + q_scale = 1. / (self.codebook_dims ** 0.5) + zhat = q_scale * zhat # on unit sphere + + return z + (zhat - z).detach() + + def quantize_new_bernoulli(self, z, prob_z): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + + zhat = (torch.bernoulli(prob_z) - 0.5) * 2.0 + + q_scale = 1. / (self.codebook_dims ** 0.5) + zhat = q_scale * zhat # on unit sphere + + return z + (zhat - z).detach() + + def rot_quantize(self, z, inference=False): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + q_scale = 1. / (self.codebook_dims ** 0.5) + zhat = torch.where(z > 0, + torch.tensor(1, dtype=z.dtype, device=z.device), + torch.tensor(-1, dtype=z.dtype, device=z.device)) * q_scale + if inference: + return zhat + + w = ((z + zhat) / torch.norm(z + zhat, dim=-1, keepdim=True)).detach() + z = z.unsqueeze(1) - 2*torch.bmm(torch.bmm(z.unsqueeze(1), w.unsqueeze(-1)), w.unsqueeze(1)) + 2 * torch.bmm( + torch.bmm(z.unsqueeze(1), z.unsqueeze(-1).detach()), zhat.unsqueeze(1).detach()) + return z.squeeze() + + def soft_entropy_loss(self, z): + if self.persample_entropy_compute == 'analytical': + # if self.l2_norm: + p = torch.sigmoid(-4 * z / (self.codebook_dims ** 0.5) * self.inv_temperature) + # else: + # p = torch.sigmoid(-4 * z * self.inv_temperature) + prob = torch.stack([p, 1-p], dim=-1) # (b, h, w, 18, 2) + per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() # (b,h,w,18)->(b,h,w)->scalar + else: + per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() + + # macro average of the probability of each subgroup + avg_prob = reduce(prob, '... g d ->g d', 'mean') # (18, 2) + codebook_entropy = self.get_entropy(avg_prob, dim=-1, normalize=False) + + # the approximation of the entropy is the sum of the entropy of each subgroup + return per_sample_entropy, codebook_entropy.sum(), avg_prob + + def get_entropy(self, count, dim=-1, eps=1e-4, normalize=True): + if normalize: # False + probs = (count + eps) / (count + eps).sum(dim=dim, keepdim =True) + else: # True + probs = count + H = -(probs * torch.log(probs + 1e-8)).sum(dim=dim) + return H + + def forward( + self, + x, + return_loss_breakdown = False, + mask = None, + entropy_weight=0.1 + ): + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension, which is also log2(codebook size) + c - number of codebook dim + """ + + is_img_or_video = x.ndim >= 4 + should_transpose = default(self.channel_first, is_img_or_video) + + # standardize image or video into (batch, seq, dimension) + + if should_transpose: + x = rearrange(x, 'b d ... -> b ... 
d') + x, ps = pack_one(x, 'b * d') # x.shape [b, hwt, c] + + assert x.shape[-1] == self.dim, f'expected dimension of {self.dim} but received {x.shape[-1]}' + + # split out number of codebooks + + x = rearrange(x, 'b n (c d) -> b n c d', c = self.num_codebooks) + + if self.use_bernoulli: # False + prob_x = torch.sigmoid(x) + + x = l2norm(x) + + # whether to force quantization step to be full precision or not + + force_f32 = self.force_quantization_f32 + + quantization_context = partial(autocast, 'cuda', enabled = False) if force_f32 else nullcontext + + with quantization_context(): + + if force_f32: + orig_dtype = x.dtype + x = x.float() + + # use straight-through gradients + if self.use_rot_trick: # False + x_f = x.flatten(end_dim=-2) # (b, hwt, 1, d) -> (bhwt, d) + q_f = self.rot_quantize(x_f, inference= not self.training) + quantized = q_f.reshape(x.shape) + elif self.use_bernoulli: # False + quantized = self.quantize_new_bernoulli(x, prob_x) + else: + quantized = self.quantize(x) + + # calculate indices + indices = reduce((quantized > 0).int() * self.mask.int(), 'b n c d -> b n c', 'sum') + + # entropy aux loss + if self.training: + persample_entropy, cb_entropy, avg_prob = self.soft_entropy_loss(x) # compute entropy + entropy_penalty = self.gamma0 * persample_entropy - self.gamma * cb_entropy + else: + # if not training, just return dummy 0 + entropy_penalty = persample_entropy = cb_entropy = self.zero + + # commit loss + + if self.training and self.commitment_loss_weight > 0.: + + commit_loss = F.mse_loss(x, quantized.detach(), reduction = 'none') + + if exists(mask): + commit_loss = commit_loss[mask] + + commit_loss = commit_loss.mean() + else: + commit_loss = self.zero + + # input back to original dtype if needed + + if force_f32: + x = x.type(orig_dtype) + + # merge back codebook dim + x = quantized # rename quantized to x for output + + if self.use_out_phi_res: # False + x = x + self.out_phi_scale * self.out_phi(x) # apply out_phi on quant output as residual + else: + x = self.out_phi(x) # apply out_phi on quant output + + x = rearrange(x, 'b n c d -> b n (c d)') + + # reconstitute image or video dimensions + + if should_transpose: + x = unpack_one(x, ps, 'b * d') + x = rearrange(x, 'b ... d -> b d ...') + + indices = unpack_one(indices, ps, 'b * c') + + # whether to remove single codebook dim + + if not self.keep_num_codebooks_dim: + indices = rearrange(indices, '... 1 -> ...') + + # complete aux loss + + aux_loss = commit_loss * self.commitment_loss_weight + (self.zeta * entropy_penalty / self.inv_temperature)*entropy_weight + # returns + + ret = Return(x, indices, aux_loss) + + if not return_loss_breakdown: + return ret + + return ret, LossBreakdown(persample_entropy, cb_entropy, commit_loss) + + +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq_tp_absorb_patchify.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq_tp_absorb_patchify.py new file mode 100644 index 0000000000000000000000000000000000000000..c1434a7c511b4460bc3600ebd59286dcb54c4bc1 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_bsq_tp_absorb_patchify.py @@ -0,0 +1,878 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +""" +Binary Spherical Quantization +Proposed in https://arxiv.org/abs/2406.07548 + +In the simplest setup, each dimension is quantized into {-1, 1}. 
+An entropy penalty is used to encourage utilization. +""" + +import random +import copy +from math import log2, ceil +from functools import partial, cache +from collections import namedtuple +from contextlib import nullcontext + +import torch.distributed as dist +from torch.distributed import nn as dist_nn + +import torch +from torch import nn, einsum +import torch.nn.functional as F +from torch.nn import Module +from torch.amp import autocast +import numpy as np + +from einops import rearrange, reduce, pack, unpack + +# from einx import get_at + +from infinity.models.videovae.utils.dynamic_resolution import predefined_HW_Scales_dynamic +from infinity.models.videovae.utils.dynamic_resolution_two_pyramid import dynamic_resolution_thw, total_pixels2scales + +# print(f"{dynamic_resolution_thw=}") + +# constants + +Return = namedtuple('Return', ['quantized', 'indices', 'bit_indices', 'entropy_aux_loss']) + +LossBreakdown = namedtuple('LossBreakdown', ['per_sample_entropy', 'batch_entropy', 'commitment']) + +# distributed helpers + +@cache +def is_distributed(): + return dist.is_initialized() and dist.get_world_size() > 1 + +def maybe_distributed_mean(t): + if not is_distributed(): + return t + + dist_nn.all_reduce(t) + t = t / dist.get_world_size() + return t + +# helper functions + +def exists(v): + return v is not None + +def identity(t): + return t + +def default(*args): + for arg in args: + if exists(arg): + return arg() if callable(arg) else arg + return None + +def round_up_multiple(num, mult): + return ceil(num / mult) * mult + +def pack_one(t, pattern): + return pack([t], pattern) + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + +def l2norm(t): + return F.normalize(t, dim = -1) + +# entropy + +def log(t, eps = 1e-5): + return t.clamp(min = eps).log() + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + +# cosine sim linear + +class CosineSimLinear(Module): + def __init__( + self, + dim_in, + dim_out, + scale = 1. 
+ ): + super().__init__() + self.scale = scale + self.weight = nn.Parameter(torch.randn(dim_in, dim_out)) + + def forward(self, x): + x = F.normalize(x, dim = -1) + w = F.normalize(self.weight, dim = 0) + return (x @ w) * self.scale + +def repeat_schedule(scale_schedule, repeat_scales_num, times): + new_scale_schedule = [] + for i in range(repeat_scales_num): + new_scale_schedule.extend([scale_schedule[i] for _ in range(times)]) + new_scale_schedule.extend(scale_schedule[repeat_scales_num:]) + return new_scale_schedule + +def get_latent2scale_schedule(T: int, H: int, W: int, mode="original", last_scale_repeat_n=0): + assert mode in ["original", "dynamic", "dense", "same1", "same2", "same3", "half", "dense_f8", 'dense_f8_double', \ + "infinity_video_two_pyramid", "infinity_video_two_pyramid_full_time", "infinity_video_two_pyramid_full_time_motion_boost_v2"] + predefined_HW_Scales = {} + if mode.startswith("infinity_video_two_pyramid"): + if "motion_boost_v2" in mode: + times = 6 + base_scale_schedule = copy.deepcopy(dynamic_resolution_thw[(H//2, W//2)]['scales']) + image_scale_schedule = repeat_schedule(base_scale_schedule, 3, times) + spatial_time_schedule = [] + spatial_time_schedule.extend(image_scale_schedule) + firstframe_scalecnt = len(image_scale_schedule) + if T > 1: + scale_schedule = repeat_schedule(base_scale_schedule, 7, times) + predefined_t = [T - 1 for _ in range(len(scale_schedule))] + spatial_time_schedule.extend([(min(int(np.round(predefined_t[i])), T - 1), h, w) for i, (_, h, w) in enumerate(scale_schedule)]) + # double h and w + spatial_time_schedule_double = [(t, 2*h, 2*w) for (t, h, w) in spatial_time_schedule] + tower_split_index = firstframe_scalecnt + return spatial_time_schedule_double, tower_split_index + spatial_time_schedule = copy.deepcopy(dynamic_resolution_thw[(H//2, W//2)]['scales']) + spatial_time_schedule.extend(spatial_time_schedule[-1:] * last_scale_repeat_n) + tower_split_index = dynamic_resolution_thw[(H//2, W//2)]['tower_split_index'] + last_scale_repeat_n + if T > 1: + # predefined_t = np.linspace(1, compressed_frames - 1, len(scale_schedule)) + if mode == "infinity_video_two_pyramid_full_time": + spatial_time_schedule.extend([(T - 1, h, w) for i, (_, h, w) in enumerate(spatial_time_schedule)]) + else: + predefined_t = np.linspace(1, T - 1, total_pixels2scales['0.06M']-3).tolist() + [T - 1] * (len(spatial_time_schedule)-total_pixels2scales['0.06M']+3) + spatial_time_schedule.extend([(min(int(np.round(predefined_t[i])), T - 1), h, w) for i, (_, h, w) in enumerate(spatial_time_schedule)]) + spatial_time_schedule.extend(spatial_time_schedule[-1:] * last_scale_repeat_n) + # double h and w + spatial_time_schedule_double = [(t, 2*h, 2*w) for (t, h, w) in spatial_time_schedule] + return spatial_time_schedule_double, tower_split_index + if mode == "original": + predefined_HW_Scales = { + # 256x256 + (16, 16): [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (8, 8), (10, 10), (13, 13), (16, 16)], + (36, 64): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 6), (9, 12), (13, 16), (18, 24), (24, 32), (32, 48), (36, 64)], + (18, 32): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 8), (8, 10), (10, 14), (12, 18), (14, 22), (16, 26), (18, 32)], + (30, 53): [(1, 1), (2, 2), (3, 3), (4, 7), (6, 11), (8, 14), (12, 21), (16, 28), (20, 35), (22, 39), (24, 42), (26, 46), (28, 50), (30, 53)] + } + predefined_HW_Scales[(32, 32)] = predefined_HW_Scales[(16, 16)] + [(20, 20), (24, 24), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (64, 64)] + 
elif mode == "dynamic": + predefined_HW_Scales.update(predefined_HW_Scales_dynamic) + elif mode == "dense": + predefined_HW_Scales[(16, 16)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(32, 32)] = predefined_HW_Scales[(16, 16)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + elif mode == "dense_f8": + # predefined_HW_Scales[(16, 16)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(32, 32)] = [(x, x) for x in range(1, 16+1)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + predefined_HW_Scales[(128, 128)] = predefined_HW_Scales[(64, 64)] + [(80, 80), (96, 96), (112, 112), (128, 128)] + elif mode == "dense_f8_double": + # predefined_HW_Scales setting double from dense f16 + predefined_HW_Scales[(32, 32)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(96, 96)] = predefined_HW_Scales[(64, 64)] + [(40, 40), (48, 48)] + predefined_HW_Scales[(128, 128)] = predefined_HW_Scales[(64, 64)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + + predefined_HW_Scales[(24, 42)] = [(1, 1), (2, 2), (3, 3), (3, 4), (3, 5), (4, 6), (4, 7), (5, 8), (6, 9), (6, 10), (6, 11), (7, 12), (7, 13), (8, 14), (9, 15), (9, 16), (12, 21)] + predefined_HW_Scales[(36, 64)] = predefined_HW_Scales[(24, 42)] + [(14, 26), (18, 32)] + predefined_HW_Scales[(60, 108)] = predefined_HW_Scales[(36, 64)] + [(24, 42), (30, 54)] + predefined_HW_Scales[(90, 160)] = predefined_HW_Scales[(60, 108)] + [(38, 66),(45, 80)] + + for k, v in predefined_HW_Scales.items(): + predefined_HW_Scales[k] = [(2*x, 2*y) for (x, y) in v] + elif mode.startswith("same"): + num_quant = int(mode[len("same"):]) + predefined_HW_Scales[(16, 16)] = [(16, 16) for _ in range(num_quant)] + predefined_HW_Scales[(32, 32)] = [(32, 32) for _ in range(num_quant)] + predefined_HW_Scales[(64, 64)] = [(64, 64) for _ in range(num_quant)] + elif mode == "half": + predefined_HW_Scales[(32, 32)] = [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (8, 8), (10, 10), (13, 13), (16, 16)] + predefined_HW_Scales[(64, 64)] = [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16)] + else: + raise NotImplementedError + + # predefined_T_Scales = [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 17, 17, 17, 17, 17, 17] + # predefined_T_Scales = [1, 2, 3, 4, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27] + predefined_T_Scales = [1, 2, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29] + # predefined_T_Scales = [1, 2, 3, 5, 6, 8, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] + patch_THW_shape_per_scale = predefined_HW_Scales[(H, W)] + if len(predefined_T_Scales) < len(patch_THW_shape_per_scale): + # print("warning: the length of predefined_T_Scales is less than the length of patch_THW_shape_per_scale!") + predefined_T_Scales += [predefined_T_Scales[-1]] * (len(patch_THW_shape_per_scale) - len(predefined_T_Scales)) + patch_THW_shape_per_scale = [(min(T, t), h, w ) for (h, w), t in zip(patch_THW_shape_per_scale, predefined_T_Scales[:len(patch_THW_shape_per_scale)])] + return patch_THW_shape_per_scale + +# TP: Two Pyramid +class MultiScaleBSQTP(Module): + """ Follows Algorithm 1. 
in https://arxiv.org/pdf/2107.03312.pdf """ + + def __init__( + self, + *, + dim, + soft_clamp_input_value = None, + aux_loss = False, # intermediate auxiliary loss + use_stochastic_depth=False, + drop_rate=0., + schedule_mode="original", # ["original", "dynamic", "dense"] + keep_first_quant=False, + keep_last_quant=False, + remove_residual_detach=False, + random_flip = False, + flip_prob = 0.5, + flip_mode = "stochastic", # "stochastic", "deterministic" + max_flip_lvl = 1, + random_flip_1lvl = False, # random flip one level each time + flip_lvl_idx = None, + drop_when_test=False, + drop_lvl_idx=None, + drop_lvl_num=0, + random_short_schedule = False, # randomly use short schedule (schedule for images of 256x256) + short_schedule_prob = 0.5, + disable_flip_prob = 0.0, # disable random flip in this image + casual_multi_scale = False, # causal multiscale + temporal_slicing = False, + last_scale_repeat_n = 0, + num_lvl_fsq = None, + other_args = None, + **kwargs + ): + super().__init__() + codebook_dim = dim + self.use_stochastic_depth = use_stochastic_depth + self.drop_rate = drop_rate + self.remove_residual_detach = remove_residual_detach + self.random_flip = random_flip + self.flip_prob = flip_prob + self.flip_mode = flip_mode + self.max_flip_lvl = max_flip_lvl + self.random_flip_1lvl = random_flip_1lvl + self.flip_lvl_idx = flip_lvl_idx + assert (random_flip and random_flip_1lvl) == False + self.disable_flip_prob = disable_flip_prob + self.casual_multi_scale = casual_multi_scale + self.temporal_slicing = temporal_slicing + self.last_scale_repeat_n = last_scale_repeat_n + # print(f"{casual_multi_scale=}") + + self.drop_when_test = drop_when_test + self.drop_lvl_idx = drop_lvl_idx + self.drop_lvl_num = drop_lvl_num + if self.drop_when_test: + assert drop_lvl_idx is not None + assert drop_lvl_num > 0 + self.random_short_schedule = random_short_schedule + self.short_schedule_prob = short_schedule_prob + self.z_interplote_up = 'trilinear' + self.z_interplote_down = 'area' + + self.schedule_mode = schedule_mode + self.keep_first_quant = keep_first_quant + self.keep_last_quant = keep_last_quant + if self.use_stochastic_depth and self.drop_rate > 0: + assert self.keep_first_quant or self.keep_last_quant + + self.full2short = {7:7, 10:7, 13:7, 16:16, 20:16, 24:16} + if self.schedule_mode == 'dense_f8': + self.full2short_f8 = {20:20, 24:24, 28:24} + elif self.schedule_mode == 'dense_f8_double': + self.full2short_f8 = {16: 14, 17: 14, 19: 14, 20:14, 21:14, 22:14, 24:14} + elif self.schedule_mode.startswith("infinity_video_two_pyramid"): + self.full2short_f8 = {11: 11, 13: 11, 14: 11, 16: 11, 29: 26, 28: 26, 26: 26} + + self.other_args = other_args + self.origin_C = 64 + self.detail_scale_dim, self.semantic_scale_dim = self.other_args.detail_scale_dim, self.other_args.semantic_scale_dim + self.lfq_semantic = BSQ( + dim = self.semantic_scale_dim, + codebook_scale = 1, + soft_clamp_input_value = soft_clamp_input_value, + **kwargs, + ) + self.lfq_detail = BSQ( + dim = self.detail_scale_dim, + codebook_scale = 1, + soft_clamp_input_value = soft_clamp_input_value, + **kwargs, + ) + + self.detail_scale_min_tokens = other_args.detail_scale_min_tokens # include + + # Always create projection layers to handle dimension mismatches + # Use learnable multi-layer projections if enabled, otherwise use simple linear projections + if self.other_args.use_learnable_dim_proj: + middle_hidden_dim=64 + self.semantic_proj_down = nn.Sequential( + nn.Linear(self.origin_C, middle_hidden_dim), + nn.SiLU(), + 
nn.Linear(middle_hidden_dim, self.semantic_scale_dim), + ) + self.semantic_proj_up = nn.Sequential( + nn.Linear(self.semantic_scale_dim, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.origin_C), + ) + else: + # Simple linear projections when learnable_dim_proj is disabled + self.semantic_proj_down = nn.Linear(self.origin_C, self.semantic_scale_dim) + self.semantic_proj_up = nn.Linear(self.semantic_scale_dim, self.origin_C) + + assert self.detail_scale_dim >= self.origin_C + if self.detail_scale_dim == self.origin_C: + self.detail_proj_up, self.detail_proj_down = nn.Identity(), nn.Identity() + else: + if self.other_args.use_learnable_dim_proj: + middle_hidden_dim=64 + self.detail_proj_up = nn.Sequential( + nn.Linear(self.origin_C, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.detail_scale_dim), + ) + self.detail_proj_down = nn.Sequential( + nn.Linear(self.detail_scale_dim, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.origin_C), + ) + else: + # Simple linear projections when learnable_dim_proj is disabled + self.detail_proj_up = nn.Linear(self.origin_C, self.detail_scale_dim) + self.detail_proj_down = nn.Linear(self.detail_scale_dim, self.origin_C) + + @property + def codebooks(self): + return self.lfq_detail.codebook + + def get_codes_from_indices(self, indices_list): + all_codes = [] + for indices in indices_list: + # Determine which quantizer to use based on indices shape + # indices: [B, t, h, w, d] or similar + if len(indices.shape) == 5 and indices.shape[-1] == self.origin_C: + # This shouldn't happen with the two-quantizer setup, but handle it + codes = self.lfq_detail.indices_to_codes(indices) + elif len(indices.shape) == 5 and indices.shape[-1] == self.semantic_scale_dim: + codes = self.lfq_semantic.indices_to_codes(indices) + elif len(indices.shape) == 5 and indices.shape[-1] == self.detail_scale_dim: + codes = self.lfq_detail.indices_to_codes(indices) + else: + # Default to detail quantizer + codes = self.lfq_detail.indices_to_codes(indices) + all_codes.append(codes) + _, _, T, H, W = all_codes[-1].size() + summed_codes = 0 + for code in all_codes: + summed_codes += F.interpolate(code, size=(T, H, W), mode=self.z_interplote_up) + return summed_codes + + def get_output_from_indices(self, indices): + codes = self.get_codes_from_indices(indices) + codes_summed = reduce(codes, 'q ... 
-> ...', 'sum') + return codes_summed + + def flip_quant(self, x): + # assert self.flip_mode in ['stochastic', 'stochastic_dynamic'] + if self.flip_mode == 'stochastic': + flip_mask = torch.rand_like(x) < self.flip_prob + elif self.flip_mode == 'stochastic_dynamic': + flip_prob = random.uniform(0, self.flip_prob) + flip_mask = torch.rand_like(x) < flip_prob + else: + raise NotImplementedError + x = x.clone() + x[flip_mask] = -x[flip_mask] + return x + + def forward( + self, + x, + mask = None, + return_all_codes = False, + ): + if x.ndim == 4: + x = x.unsqueeze(2) + B, C, T, H, W = x.size() + + if self.schedule_mode.startswith("same"): + scale_num = int(self.schedule_mode[len("same"):]) + assert T == 1 + scale_schedule = [(1, H, W)] * scale_num + tower_split_index = 0 # No first frame special handling for same mode + elif self.schedule_mode.startswith("infinity_video_two_pyramid"): + scale_schedule, tower_split_index = get_latent2scale_schedule(T, H, W, mode=self.schedule_mode, last_scale_repeat_n=self.last_scale_repeat_n) + scale_num = len(scale_schedule) + else: + scale_schedule = get_latent2scale_schedule(T, H, W, mode=self.schedule_mode) + scale_num = len(scale_schedule) + tower_split_index = 0 # For non-infinity_video_two_pyramid modes, no first frame special handling + + if self.training and self.random_short_schedule and random.random() < self.short_schedule_prob: + if self.schedule_mode.startswith("infinity_video_two_pyramid"): + if T == 1: + scale_num = self.full2short_f8[scale_num] + tower_split_index = scale_num + else: + pass + else: + if self.schedule_mode.startswith("dense_f8"): + # print(B, C, T, H, W, scale_num, self.full2short_f8[scale_num], scale_schedule) + scale_num = self.full2short_f8[scale_num] + # print('after: \n', scale_schedule[:scale_num]) + else: + scale_num = self.full2short[scale_num] + scale_schedule = scale_schedule[:scale_num] + + quantized_out = 0. 
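+ # Two-pyramid residual quantization: the loop below runs coarse-to-fine over
+ # scale_schedule. Scales with si < tower_split_index quantize only the first
+ # frame (residual[:, :, :1]); the remaining scales quantize frames 1..T-1.
+ # At each scale the residual is downsampled to (pt, ph, pw) with 'area'
+ # interpolation, quantized by the semantic BSQ head (ph*pw < detail_scale_min_tokens)
+ # or the detail BSQ head (larger grids), projected back to origin_C channels
+ # when the head dimension differs, upsampled with 'trilinear' interpolation,
+ # and subtracted from the residual. The first-frame and video reconstructions
+ # are accumulated separately and concatenated along the time axis at the end.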
+ residual = x + quantized_out_firstframe = None + + all_losses = [] + all_indices = [] + all_bit_indices = [] + var_inputs = [] + residual_norm_per_scale = [] + + # go through the layers + # residual_list = [] + # interpolate_residual_list = [] + # quantized_list = [] + with autocast('cuda', enabled = False): + for si, (pt, ph, pw) in enumerate(scale_schedule): + + if si < tower_split_index: + if (pt, ph, pw) != (1, H, W): + interpolate_residual = F.interpolate(residual[:, :, :1, :, :].clone(), size=(pt, ph, pw), mode=self.z_interplote_down) + else: + interpolate_residual = residual[:, :, :1, :, :] + else: + if (pt, ph, pw) != (T-1, H, W): + if self.casual_multi_scale: + interpolate_residual = F.interpolate(residual[:, :, 1:pt+1, :, :], size=(pt, ph, pw), mode=self.z_interplote_down) + elif self.temporal_slicing: + temporal_indices = list(map(int, np.linspace(1, T-1, pt))) + assert len(temporal_indices) == pt + interpolate_residual = F.interpolate(residual[:, :, temporal_indices, :, :], size=(pt, ph, pw), mode=self.z_interplote_down) + else: + interpolate_residual = F.interpolate(residual[:, :, 1:, :, :].clone(), size=(pt, ph, pw), mode=self.z_interplote_down) + else: + interpolate_residual = residual[:, :, 1:, :, :] + if si != 0 and si != tower_split_index and self.use_stochastic_depth and random.random() < self.drop_rate: + quantized = torch.zeros_like(interpolate_residual) + else: + # Select quantizer based on scale size + if ph * pw >= self.detail_scale_min_tokens: + # Use detail quantizer for larger scales + lfq = self.lfq_detail + C1 = self.detail_scale_dim + else: + # Use semantic quantizer for smaller scales + lfq = self.lfq_semantic + C1 = self.semantic_scale_dim + + # Apply projection if needed (always needed when dimensions don't match) + # interpolate_residual shape: [B, C, T, H, W] + if C1 != self.origin_C: + # Permute to [B, T, H, W, C] for linear layer + interpolate_residual = interpolate_residual.permute(0, 2, 3, 4, 1) # [B, C, T, H, W] -> [B, T, H, W, C] + if C1 == self.semantic_scale_dim: + interpolate_residual = self.semantic_proj_down(interpolate_residual) # [B, T, H, W, C] -> [B, T, H, W, C1] + else: + interpolate_residual = self.detail_proj_down(interpolate_residual) # [B, T, H, W, C] -> [B, T, H, W, C1] + # Permute back to [B, C1, T, H, W] + interpolate_residual = interpolate_residual.permute(0, 4, 1, 2, 3) # [B, T, H, W, C1] -> [B, C1, T, H, W] + + quantized, indices, bit_indices, loss = lfq(interpolate_residual) + + # Apply inverse projection if needed (always needed when dimensions don't match) + # quantized shape: [B, C1, T, H, W] + if C1 != self.origin_C: + # Permute to [B, T, H, W, C1] for linear layer + quantized = quantized.permute(0, 2, 3, 4, 1) # [B, C1, T, H, W] -> [B, T, H, W, C1] + if C1 == self.semantic_scale_dim: + quantized = self.semantic_proj_up(quantized) # [B, T, H, W, C1] -> [B, T, H, W, C] + else: + quantized = self.detail_proj_up(quantized) # [B, T, H, W, C1] -> [B, T, H, W, C] + # Permute back to [B, C, T, H, W] + quantized = quantized.permute(0, 4, 1, 2, 3) # [B, T, H, W, C] -> [B, C, T, H, W] + + all_indices.append(indices) + all_losses.append(loss) + all_bit_indices.append(bit_indices) + + # if (pt, ph, pw) != (T, H, W): + if si < tower_split_index: + if (pt, ph, pw) != (1, H, W): + quantized = F.interpolate(quantized, size=(1, H, W), mode=self.z_interplote_up).contiguous() + else: + if (pt, ph, pw) != (T-1, H, W): + quantized = F.interpolate(quantized, size=(T-1, H, W), mode=self.z_interplote_up).contiguous() + if si < tower_split_index: 
+ residual[:, :, :1, :, :] = residual[:, :, :1, :, :] - quantized + else: + residual[:, :, 1:, :, :] = residual[:, :, 1:, :, :] - quantized + + if si < tower_split_index: + quantized_out = quantized_out + quantized + if si == tower_split_index - 1: + quantized_out_firstframe = quantized_out.clone() + quantized_out = 0 + else: + quantized_out = quantized_out + quantized + + if quantized_out_firstframe is not None: + if len(scale_schedule) == tower_split_index: + quantized_out = quantized_out_firstframe + else: + quantized_out = torch.cat([quantized_out_firstframe, quantized_out], dim=2) + + # print("residual_list:", residual_list) + # print("interpolate_residual_list:", interpolate_residual_list) + # print("quantized_list:", quantized_list) + # import ipdb; ipdb.set_trace() + # project out, if needed + + # stack all losses and indices + + all_losses = torch.stack(all_losses, dim = -1) + + ret = (quantized_out, all_indices, all_bit_indices, residual_norm_per_scale, all_losses, var_inputs) + + if not return_all_codes: + return ret + + # whether to return all codes from all codebooks across layers + all_codes = self.get_codes_from_indices(all_indices) + + # will return all codes in shape (quantizer, batch, sequence length, codebook dimension) + + return (*ret, all_codes) + + +class BSQ(Module): + def __init__( + self, + *, + dim = None, + entropy_loss_weight = 0.1, + commitment_loss_weight = 0.25, + num_codebooks = 1, + keep_num_codebooks_dim = None, + codebook_scale = 1., # for residual LFQ, codebook scaled down by 2x at each layer + frac_per_sample_entropy = 1., # make less than 1. to only use a random fraction of the probs for per sample entropy + soft_clamp_input_value = None, + channel_first = None, + experimental_softplus_entropy_loss = False, + entropy_loss_offset = 5., # how much to shift the loss before softplus + spherical = True, # from https://arxiv.org/abs/2406.07548 + force_quantization_f32 = True, # will force the quantization step to be full precision + inv_temperature = 100.0, + gamma0=1.0, gamma=1.0, zeta=1.0, + use_out_phi = False, # use output phi network + use_out_phi_res = False, # residual out phi + use_bernoulli = False, + use_rot_trick = False, + ): + super().__init__() + + # some assert validations + assert exists(dim) , 'dim must be specified for BSQ' + + codebook_dim = dim + codebook_dims = codebook_dim * num_codebooks + dim = default(dim, codebook_dims) + self.codebook_dims = codebook_dims + + self.out_phi = nn.Linear(codebook_dims, codebook_dims) if use_out_phi else nn.Identity() + self.use_out_phi_res = use_out_phi_res + if self.use_out_phi_res: + self.out_phi_scale = nn.Parameter(torch.zeros(codebook_dims), requires_grad=True) # init as zero + + self.dim = dim + self.codebook_dim = codebook_dim + self.num_codebooks = num_codebooks + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + # channel first + self.channel_first = channel_first + + # For BSQ (binary spherical quantization) + if not spherical: + raise ValueError("For BSQ, spherical must be True.") + self.persample_entropy_compute = 'analytical' + self.inv_temperature = inv_temperature + self.gamma0 = gamma0 # loss weight for entropy penalty + self.gamma = gamma # loss weight for entropy penalty + self.zeta = zeta # loss weight for entire entropy penalty + self.use_bernoulli = use_bernoulli + self.use_rot_trick = use_rot_trick + + # entropy aux loss related 
weights + + assert 0 < frac_per_sample_entropy <= 1. + self.frac_per_sample_entropy = frac_per_sample_entropy + + self.entropy_loss_weight = entropy_loss_weight + + # codebook scale + + self.codebook_scale = codebook_scale + + # commitment loss + + self.commitment_loss_weight = commitment_loss_weight + + # whether to soft clamp the input value from -value to value + + self.soft_clamp_input_value = soft_clamp_input_value + assert not exists(soft_clamp_input_value) or soft_clamp_input_value >= codebook_scale + + # whether to make the entropy loss positive through a softplus (experimental, please report if this worked or not in discussions) + + self.entropy_loss_offset = entropy_loss_offset + self.experimental_softplus_entropy_loss = experimental_softplus_entropy_loss + + # for no auxiliary loss, during inference + + self.register_buffer('mask', 2 ** torch.arange(codebook_dim - 1, -1, -1)) + self.register_buffer('zero', torch.tensor(0.), persistent = False) + + # whether to force quantization step to be f32 + + self.force_quantization_f32 = force_quantization_f32 + + def bits_to_codes(self, bits): + return bits * self.codebook_scale * 2 - self.codebook_scale + + # @property + # def dtype(self): + # return self.codebook.dtype + + def indices_to_codes( + self, + indices, + label_type = 'int_label', + project_out = True + ): + assert label_type in ['int_label', 'bit_label'] + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + should_transpose = default(self.channel_first, is_img_or_video) + + if not self.keep_num_codebooks_dim: + if label_type == 'int_label': + indices = rearrange(indices, '... -> ... 1') + else: + indices = indices.unsqueeze(-2) + + # indices to codes, which are bits of either -1 or 1 + + if label_type == 'int_label': + assert indices[..., None].int().min() > 0 + bits = ((indices[..., None].int() & self.mask) != 0).float() # .to(self.dtype) + else: + bits = indices + + codes = self.bits_to_codes(bits).float() + + codes = l2norm(codes) # must normalize when using BSQ + + codes = rearrange(codes, '... c d -> ... (c d)') + + # whether to project codes out to original dimensions + # if the input feature dimensions were not log2(codebook size) + + # rearrange codes back to original shape + + if should_transpose: + codes = rearrange(codes, 'b ... d -> b d ...') + + return codes + + def quantize(self, z): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + + zhat = torch.where(z > 0, + torch.tensor(1, dtype=z.dtype, device=z.device), + torch.tensor(-1, dtype=z.dtype, device=z.device)) + + q_scale = 1. / (self.codebook_dims ** 0.5) + zhat = q_scale * zhat # on unit sphere + + return z + (zhat - z).detach() + + def quantize_new_bernoulli(self, z, prob_z): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + + zhat = (torch.bernoulli(prob_z) - 0.5) * 2.0 + + q_scale = 1. / (self.codebook_dims ** 0.5) + zhat = q_scale * zhat # on unit sphere + + return z + (zhat - z).detach() + + def rot_quantize(self, z, inference=False): + assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}" + q_scale = 1. 
/ (self.codebook_dims ** 0.5) + zhat = torch.where(z > 0, + torch.tensor(1, dtype=z.dtype, device=z.device), + torch.tensor(-1, dtype=z.dtype, device=z.device)) * q_scale + if inference: + return zhat + + w = ((z + zhat) / torch.norm(z + zhat, dim=-1, keepdim=True)).detach() + z = z.unsqueeze(1) - 2*torch.bmm(torch.bmm(z.unsqueeze(1), w.unsqueeze(-1)), w.unsqueeze(1)) + 2 * torch.bmm( + torch.bmm(z.unsqueeze(1), z.unsqueeze(-1).detach()), zhat.unsqueeze(1).detach()) + return z.squeeze() + + def soft_entropy_loss(self, z): + if self.persample_entropy_compute == 'analytical': + # if self.l2_norm: + p = torch.sigmoid(-4 * z / (self.codebook_dims ** 0.5) * self.inv_temperature) + # else: + # p = torch.sigmoid(-4 * z * self.inv_temperature) + prob = torch.stack([p, 1-p], dim=-1) # (b, h, w, 18, 2) + per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() # (b,h,w,18)->(b,h,w)->scalar + else: + per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() + + # macro average of the probability of each subgroup + avg_prob = reduce(prob, '... g d ->g d', 'mean') # (18, 2) + codebook_entropy = self.get_entropy(avg_prob, dim=-1, normalize=False) + + # the approximation of the entropy is the sum of the entropy of each subgroup + return per_sample_entropy, codebook_entropy.sum(), avg_prob + + def get_entropy(self, count, dim=-1, eps=1e-4, normalize=True): + if normalize: # False + probs = (count + eps) / (count + eps).sum(dim=dim, keepdim =True) + else: # True + probs = count + H = -(probs * torch.log(probs + 1e-8)).sum(dim=dim) + return H + + def forward( + self, + x, + return_loss_breakdown = False, + mask = None, + entropy_weight=0.1 + ): + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension, which is also log2(codebook size) + c - number of codebook dim + """ + + is_img_or_video = x.ndim >= 4 + should_transpose = default(self.channel_first, is_img_or_video) + + # standardize image or video into (batch, seq, dimension) + + if should_transpose: + x = rearrange(x, 'b d ... -> b ... 
d') + x, ps = pack_one(x, 'b * d') # x.shape [b, hwt, c] + + assert x.shape[-1] == self.dim, f'expected dimension of {self.dim} but received {x.shape[-1]}' + + # split out number of codebooks + + x = rearrange(x, 'b n (c d) -> b n c d', c = self.num_codebooks) + + if self.use_bernoulli: + prob_x = torch.sigmoid(x) + + x = l2norm(x) + + # whether to force quantization step to be full precision or not + + force_f32 = self.force_quantization_f32 + + quantization_context = partial(autocast, 'cuda', enabled = False) if force_f32 else nullcontext + + with quantization_context(): + + if force_f32: + orig_dtype = x.dtype + x = x.float() + + # use straight-through gradients + if self.use_rot_trick: + x_f = x.flatten(end_dim=-2) # (b, hwt, 1, d) -> (bhwt, d) + q_f = self.rot_quantize(x_f, inference= not self.training) + quantized = q_f.reshape(x.shape) + elif self.use_bernoulli: + quantized = self.quantize_new_bernoulli(x, prob_x) + else: + quantized = self.quantize(x) + + # calculate indices + indices = reduce((quantized > 0).int() * self.mask.int(), 'b n c d -> b n c', 'sum') + bit_indices = (quantized > 0).int() + + # entropy aux loss + if self.training: + persample_entropy, cb_entropy, avg_prob = self.soft_entropy_loss(x) # compute entropy + entropy_penalty = self.gamma0 * persample_entropy - self.gamma * cb_entropy + else: + # if not training, just return dummy 0 + entropy_penalty = persample_entropy = cb_entropy = self.zero + + # commit loss + + if self.training and self.commitment_loss_weight > 0.: + + commit_loss = F.mse_loss(x, quantized.detach(), reduction = 'none') + + if exists(mask): + commit_loss = commit_loss[mask] + + commit_loss = commit_loss.mean() + else: + commit_loss = self.zero + + # input back to original dtype if needed + + if force_f32: + x = x.type(orig_dtype) + + # merge back codebook dim + x = quantized # rename quantized to x for output + + if self.use_out_phi_res: + x = x + self.out_phi_scale * self.out_phi(x) # apply out_phi on quant output as residual + else: + x = self.out_phi(x) # apply out_phi on quant output + + x = rearrange(x, 'b n c d -> b n (c d)') + + # reconstitute image or video dimensions + + if should_transpose: + x = unpack_one(x, ps, 'b * d') + x = rearrange(x, 'b ... d -> b d ...') + + bit_indices = unpack_one(bit_indices, ps, 'b * c d') + + # whether to remove single codebook dim + + if not self.keep_num_codebooks_dim: + bit_indices = rearrange(bit_indices, '... 1 d -> ... d') + + # complete aux loss + + aux_loss = commit_loss * self.commitment_loss_weight + (self.zeta * entropy_penalty / self.inv_temperature)*entropy_weight + # returns + + ret = Return(x, indices, bit_indices, aux_loss) + + if not return_loss_breakdown: + return ret + + return ret, LossBreakdown(persample_entropy, cb_entropy, commit_loss) + diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_fsq_tp.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_fsq_tp.py new file mode 100644 index 0000000000000000000000000000000000000000..56635deea64d1223c601d310844fe1db3dc440c6 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/multiscale_fsq_tp.py @@ -0,0 +1,532 @@ +""" +Binary Spherical Quantization +Proposed in https://arxiv.org/abs/2406.07548 + +In the simplest setup, each dimension is quantized into {-1, 1}. +An entropy penalty is used to encourage utilization. 
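+
+This module defines MultiScaleFSQTP, a two-pyramid multi-scale quantizer that
+uses finite scalar quantization (FSQ) heads for its semantic and detail scales.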
+""" + +import random +import copy +from math import log2, ceil +from functools import partial, cache +from collections import namedtuple +from contextlib import nullcontext + +import torch.distributed as dist +from torch.distributed import nn as dist_nn + +import torch +from torch import nn, einsum +import torch.nn.functional as F +from torch.nn import Module +from torch.amp import autocast +import numpy as np + +from einops import rearrange, reduce, pack, unpack + +# from einx import get_at + +from infinity.models.videovae.utils.dynamic_resolution import predefined_HW_Scales_dynamic +from infinity.models.videovae.utils.dynamic_resolution_two_pyramid import dynamic_resolution_thw, total_pixels2scales +from infinity.models.videovae.modules.quantizer.finite_scalar_quantization import FSQ +# print(f"{dynamic_resolution_thw=}") + +# constants + +Return = namedtuple('Return', ['quantized', 'indices', 'entropy_aux_loss']) + +LossBreakdown = namedtuple('LossBreakdown', ['per_sample_entropy', 'batch_entropy', 'commitment']) + +# distributed helpers + +@cache +def is_distributed(): + return dist.is_initialized() and dist.get_world_size() > 1 + +def maybe_distributed_mean(t): + if not is_distributed(): + return t + + dist_nn.all_reduce(t) + t = t / dist.get_world_size() + return t + +# helper functions + +def exists(v): + return v is not None + +def identity(t): + return t + +def default(*args): + for arg in args: + if exists(arg): + return arg() if callable(arg) else arg + return None + +def round_up_multiple(num, mult): + return ceil(num / mult) * mult + +def pack_one(t, pattern): + return pack([t], pattern) + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + +def l2norm(t): + return F.normalize(t, dim = -1) + +# entropy + +def log(t, eps = 1e-5): + return t.clamp(min = eps).log() + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + +# cosine sim linear + +class CosineSimLinear(Module): + def __init__( + self, + dim_in, + dim_out, + scale = 1. 
+ ): + super().__init__() + self.scale = scale + self.weight = nn.Parameter(torch.randn(dim_in, dim_out)) + + def forward(self, x): + x = F.normalize(x, dim = -1) + w = F.normalize(self.weight, dim = 0) + return (x @ w) * self.scale + +def repeat_schedule(scale_schedule, repeat_scales_num, times): + new_scale_schedule = [] + for i in range(repeat_scales_num): + new_scale_schedule.extend([scale_schedule[i] for _ in range(times)]) + new_scale_schedule.extend(scale_schedule[repeat_scales_num:]) + return new_scale_schedule + + +def get_latent2scale_schedule(T: int, H: int, W: int, mode="original", last_scale_repeat_n=0): + predefined_HW_Scales = {} + if mode.startswith("infinity_video_two_pyramid"): + if 'elegant' in mode: + base_scale_schedule = copy.deepcopy(dynamic_resolution_thw[(H, W)]['scales']) + image_scale_repetition = [5, 5, 5, 5, 5, 5, 5, 5, 4, 3, 2] + [1] * 10 + video_scale_repetition = [5, 5, 5, 5, 5, 5, 5, 5, 4, 3, 2] + [1] * 10 + base_scale_schedule = copy.deepcopy(dynamic_resolution_thw[(H, W)]['scales']) + def repeat_scales(base_scale_schedule, scale_repetition): + scale_schedule = [] + for i in range(len(base_scale_schedule)): + scale_schedule.extend([base_scale_schedule[i] for _ in range(scale_repetition[i])]) + return scale_schedule + image_scale_schedule = repeat_scales(base_scale_schedule, image_scale_repetition) + spatial_time_schedule = [] + spatial_time_schedule.extend(image_scale_schedule) + firstframe_scalecnt = len(image_scale_schedule) + if T > 1: + scale_schedule = repeat_scales(base_scale_schedule, video_scale_repetition) + spatial_time_schedule.extend([(T-1, h, w) for i, (_, h, w) in enumerate(scale_schedule)]) + # double h and w + tower_split_index = firstframe_scalecnt + # print(f'{spatial_time_schedule=}') + return spatial_time_schedule, tower_split_index + if "motion_boost_v2" in mode: + times = 6 + base_scale_schedule = copy.deepcopy(dynamic_resolution_thw[(H, W)]['scales']) + image_scale_schedule = repeat_schedule(base_scale_schedule, 3, times) + spatial_time_schedule = [] + spatial_time_schedule.extend(image_scale_schedule) + firstframe_scalecnt = len(image_scale_schedule) + if T > 1: + scale_schedule = repeat_schedule(base_scale_schedule, 7, times) + predefined_t = [T - 1 for _ in range(len(scale_schedule))] + spatial_time_schedule.extend([(min(int(np.round(predefined_t[i])), T - 1), h, w) for i, (_, h, w) in enumerate(scale_schedule)]) + # double h and w + spatial_time_schedule_double = [(t, 2*h, 2*w) for (t, h, w) in spatial_time_schedule] + tower_split_index = firstframe_scalecnt + return spatial_time_schedule_double, tower_split_index + spatial_time_schedule = copy.deepcopy(dynamic_resolution_thw[(H, W)]['scales']) + spatial_time_schedule.extend(spatial_time_schedule[-1:] * last_scale_repeat_n) + tower_split_index = dynamic_resolution_thw[(H, W)]['tower_split_index'] + last_scale_repeat_n + if T > 1: + # predefined_t = np.linspace(1, compressed_frames - 1, len(scale_schedule)) + if mode == "infinity_video_two_pyramid_full_time": + spatial_time_schedule.extend([(T - 1, h, w) for i, (_, h, w) in enumerate(spatial_time_schedule)]) + else: + predefined_t = np.linspace(1, T - 1, total_pixels2scales['0.06M']-3).tolist() + [T - 1] * (len(spatial_time_schedule)-total_pixels2scales['0.06M']+3) + spatial_time_schedule.extend([(min(int(np.round(predefined_t[i])), T - 1), h, w) for i, (_, h, w) in enumerate(spatial_time_schedule)]) + spatial_time_schedule.extend(spatial_time_schedule[-1:] * last_scale_repeat_n) + # double h and w + spatial_time_schedule_double = 
[(t, 2*h, 2*w) for (t, h, w) in spatial_time_schedule] + return spatial_time_schedule_double, tower_split_index + if mode == "original": + predefined_HW_Scales = { + # 256x256 + (16, 16): [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (8, 8), (10, 10), (13, 13), (16, 16)], + (36, 64): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 6), (9, 12), (13, 16), (18, 24), (24, 32), (32, 48), (36, 64)], + (18, 32): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 8), (8, 10), (10, 14), (12, 18), (14, 22), (16, 26), (18, 32)], + (30, 53): [(1, 1), (2, 2), (3, 3), (4, 7), (6, 11), (8, 14), (12, 21), (16, 28), (20, 35), (22, 39), (24, 42), (26, 46), (28, 50), (30, 53)] + } + predefined_HW_Scales[(32, 32)] = predefined_HW_Scales[(16, 16)] + [(20, 20), (24, 24), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (64, 64)] + elif mode == "dynamic": + predefined_HW_Scales.update(predefined_HW_Scales_dynamic) + elif mode == "dense": + predefined_HW_Scales[(16, 16)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(32, 32)] = predefined_HW_Scales[(16, 16)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + elif mode == "dense_f8": + # predefined_HW_Scales[(16, 16)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(32, 32)] = [(x, x) for x in range(1, 16+1)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + predefined_HW_Scales[(128, 128)] = predefined_HW_Scales[(64, 64)] + [(80, 80), (96, 96), (112, 112), (128, 128)] + elif mode == "dense_f8_double": + # predefined_HW_Scales setting double from dense f16 + predefined_HW_Scales[(32, 32)] = [(x, x) for x in range(1, 16+1)] + predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(20, 20), (24, 24), (28, 28), (32, 32)] + predefined_HW_Scales[(96, 96)] = predefined_HW_Scales[(64, 64)] + [(40, 40), (48, 48)] + predefined_HW_Scales[(128, 128)] = predefined_HW_Scales[(64, 64)] + [(40, 40), (48, 48), (56, 56), (64, 64)] + + predefined_HW_Scales[(24, 42)] = [(1, 1), (2, 2), (3, 3), (3, 4), (3, 5), (4, 6), (4, 7), (5, 8), (6, 9), (6, 10), (6, 11), (7, 12), (7, 13), (8, 14), (9, 15), (9, 16), (12, 21)] + predefined_HW_Scales[(36, 64)] = predefined_HW_Scales[(24, 42)] + [(14, 26), (18, 32)] + predefined_HW_Scales[(60, 108)] = predefined_HW_Scales[(36, 64)] + [(24, 42), (30, 54)] + predefined_HW_Scales[(90, 160)] = predefined_HW_Scales[(60, 108)] + [(38, 66),(45, 80)] + + for k, v in predefined_HW_Scales.items(): + predefined_HW_Scales[k] = [(2*x, 2*y) for (x, y) in v] + elif mode.startswith("same"): + num_quant = int(mode[len("same"):]) + predefined_HW_Scales[(16, 16)] = [(16, 16) for _ in range(num_quant)] + predefined_HW_Scales[(32, 32)] = [(32, 32) for _ in range(num_quant)] + predefined_HW_Scales[(64, 64)] = [(64, 64) for _ in range(num_quant)] + elif mode == "half": + predefined_HW_Scales[(32, 32)] = [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (8, 8), (10, 10), (13, 13), (16, 16)] + predefined_HW_Scales[(64, 64)] = [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16)] + else: + raise NotImplementedError + + # predefined_T_Scales = [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 17, 17, 17, 17, 17, 17] + # predefined_T_Scales = [1, 2, 3, 4, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27] + predefined_T_Scales = [1, 2, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29] + # predefined_T_Scales = [1, 2, 3, 5, 6, 8, 9, 11, 13, 
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] + patch_THW_shape_per_scale = predefined_HW_Scales[(H, W)] + if len(predefined_T_Scales) < len(patch_THW_shape_per_scale): + # print("warning: the length of predefined_T_Scales is less than the length of patch_THW_shape_per_scale!") + predefined_T_Scales += [predefined_T_Scales[-1]] * (len(patch_THW_shape_per_scale) - len(predefined_T_Scales)) + patch_THW_shape_per_scale = [(min(T, t), h, w ) for (h, w), t in zip(patch_THW_shape_per_scale, predefined_T_Scales[:len(patch_THW_shape_per_scale)])] + return patch_THW_shape_per_scale + +# TP: Two Pyramid +class MultiScaleFSQTP(Module): + """ Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf """ + + def __init__( + self, + *, + dim, + soft_clamp_input_value = None, + aux_loss = False, # intermediate auxiliary loss + use_stochastic_depth=False, + drop_rate=0., + schedule_mode="original", # ["original", "dynamic", "dense"] + keep_first_quant=False, + keep_last_quant=False, + remove_residual_detach=False, + random_flip = False, + flip_prob = 0.5, + flip_mode = "stochastic", # "stochastic", "deterministic" + max_flip_lvl = 1, + random_flip_1lvl = False, # random flip one level each time + flip_lvl_idx = None, + drop_when_test=False, + drop_lvl_idx=None, + drop_lvl_num=0, + random_short_schedule = False, # randomly use short schedule (schedule for images of 256x256) + short_schedule_prob = 0.5, + disable_flip_prob = 0.0, # disable random flip in this image + casual_multi_scale = False, # causal multiscale + temporal_slicing = False, + last_scale_repeat_n = 0, + num_lvl_fsq = None, + other_args = None, + **kwargs + ): + super().__init__() + codebook_dim = dim + self.use_stochastic_depth = use_stochastic_depth + self.drop_rate = drop_rate + self.remove_residual_detach = remove_residual_detach + self.random_flip = random_flip + self.flip_prob = flip_prob + self.flip_mode = flip_mode + self.max_flip_lvl = max_flip_lvl + self.random_flip_1lvl = random_flip_1lvl + self.flip_lvl_idx = flip_lvl_idx + assert (random_flip and random_flip_1lvl) == False + self.disable_flip_prob = disable_flip_prob + self.casual_multi_scale = casual_multi_scale + self.temporal_slicing = temporal_slicing + self.last_scale_repeat_n = last_scale_repeat_n + # print(f"{casual_multi_scale=}") + + self.drop_when_test = drop_when_test + self.drop_lvl_idx = drop_lvl_idx + self.drop_lvl_num = drop_lvl_num + if self.drop_when_test: + assert drop_lvl_idx is not None + assert drop_lvl_num > 0 + self.random_short_schedule = random_short_schedule + self.short_schedule_prob = short_schedule_prob + self.z_interplote_up = 'trilinear' + self.z_interplote_down = 'area' + + self.schedule_mode = schedule_mode + self.keep_first_quant = keep_first_quant + self.keep_last_quant = keep_last_quant + if self.use_stochastic_depth and self.drop_rate > 0: + assert self.keep_first_quant or self.keep_last_quant + + self.full2short = {7:7, 10:7, 13:7, 16:16, 20:16, 24:16} + if self.schedule_mode == 'dense_f8': + self.full2short_f8 = {20:20, 24:24, 28:24} + elif self.schedule_mode == 'dense_f8_double': + self.full2short_f8 = {16: 14, 17: 14, 19: 14, 20:14, 21:14, 22:14, 24:14} + elif self.schedule_mode.startswith("infinity_video_two_pyramid"): + self.full2short_f8 = {11: 11, 13: 11, 14: 11, 16: 11} + + self.other_args = other_args + print(f'{self.other_args=}') + self.origin_C = 64 + self.detail_scale_dim, self.semantic_scale_dim = self.other_args.detail_scale_dim, self.other_args.semantic_scale_dim + self.lfq_semantic = FSQ( + dim = 
self.semantic_scale_dim, + num_lvl = num_lvl_fsq, + ) + self.lfq_detail = FSQ( + dim = self.detail_scale_dim, + num_lvl = num_lvl_fsq, + ) + + self.detail_scale_min_tokens = 80 # include + middle_hidden_dim=64 + if self.other_args.use_learnable_dim_proj: + self.semantic_proj_down = nn.Sequential( + nn.Linear(self.origin_C, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.semantic_scale_dim), + ) + self.semantic_proj_up = nn.Sequential( + nn.Linear(self.semantic_scale_dim, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.origin_C), + ) + # assert self.detail_scale_dim >= self.origin_C + if self.detail_scale_dim == self.origin_C: + self.detail_proj_up, self.detail_proj_down = nn.Identity(), nn.Identity() + elif self.detail_scale_dim > self.origin_C: + self.detail_proj_up = nn.Sequential( + nn.Linear(self.origin_C, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.detail_scale_dim), + ) + self.detail_proj_down = nn.Sequential( + nn.Linear(self.detail_scale_dim, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.origin_C), + ) + else: + self.detail_proj_down = nn.Sequential( + nn.Linear(self.origin_C, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.detail_scale_dim), + ) + self.detail_proj_up = nn.Sequential( + nn.Linear(self.detail_scale_dim, middle_hidden_dim), + nn.SiLU(), + nn.Linear(middle_hidden_dim, self.origin_C), + ) + + @property + def codebooks(self): + return self.lfq.codebook + + def get_codes_from_indices(self, indices_list): + all_codes = [] + for indices in indices_list: + codes = self.lfq.indices_to_codes(indices) + all_codes.append(codes) + _, _, T, H, W = all_codes[-1].size() + summed_codes = 0 + for code in all_codes: + summed_codes += F.interpolate(code, size=(T, H, W), mode=self.z_interplote_up) + return summed_codes + + def get_output_from_indices(self, indices): + codes = self.get_codes_from_indices(indices) + codes_summed = reduce(codes, 'q ... 
-> ...', 'sum') + return codes_summed + + def flip_quant(self, x): + # assert self.flip_mode in ['stochastic', 'stochastic_dynamic'] + if self.flip_mode == 'stochastic': + flip_mask = torch.rand_like(x) < self.flip_prob + elif self.flip_mode == 'stochastic_dynamic': + flip_prob = random.uniform(0, self.flip_prob) + flip_mask = torch.rand_like(x) < flip_prob + else: + raise NotImplementedError + x = x.clone() + x[flip_mask] = -x[flip_mask] + return x + + def forward( + self, + x, + mask = None, + return_all_codes = False, + double = False + ): + if x.ndim == 4: + x = x.unsqueeze(2) + B, C, T, H, W = x.size() + if self.schedule_mode.startswith("same"): + scale_num = int(self.schedule_mode[len("same"):]) + assert T == 1 + scale_schedule = [(1, H, W)] * scale_num + elif self.schedule_mode.startswith("infinity_video_two_pyramid") or self.schedule_mode == "last_only_two_pyramid": + if double: + scale_schedule, tower_split_index = get_latent2scale_schedule(T, H*2, W*2, mode=self.schedule_mode, last_scale_repeat_n=self.last_scale_repeat_n) + scale_schedule = [(t, h//2, w//2) for (t, h, w) in scale_schedule] + scale_num = len(scale_schedule) + else: + scale_schedule, tower_split_index = get_latent2scale_schedule(T, H, W, mode=self.schedule_mode, last_scale_repeat_n=self.last_scale_repeat_n) + scale_num = len(scale_schedule) + else: + scale_schedule = get_latent2scale_schedule(T, H, W, mode=self.schedule_mode) + scale_num = len(scale_schedule) + + if self.training and self.random_short_schedule and random.random() < self.short_schedule_prob: + if self.schedule_mode.startswith("infinity_video_two_pyramid"): + if T == 1: + scale_num = self.full2short_f8[scale_num] + tower_split_index = scale_num + else: + pass + else: + if self.schedule_mode.startswith("dense_f8"): + # print(B, C, T, H, W, scale_num, self.full2short_f8[scale_num], scale_schedule) + scale_num = self.full2short_f8[scale_num] + # print('after: \n', scale_schedule[:scale_num]) + else: + scale_num = self.full2short[scale_num] + scale_schedule = scale_schedule[:scale_num] + + quantized_out = 0. 
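+ # FSQ variant of the two-pyramid residual loop. The nested interpolate() helper
+ # below both resizes the residual to the target (t, h, w) grid and changes its
+ # channel dimension: with use_learnable_dim_proj it routes through the
+ # semantic/detail projection layers, otherwise it interpolates over the channel
+ # axis directly. Grids with ph*pw < detail_scale_min_tokens use the semantic
+ # FSQ head, larger grids the detail FSQ head. Note that skip_detail_scales
+ # persists across loop iterations: grids with ph*pw < 16*16 reset it to False,
+ # larger grids only set it to True when the random draw fires, so a True value
+ # can carry over to subsequent detail scales.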
+ residual = x + quantized_out_firstframe = None + + all_losses = [] + all_indices = [] + + # go through the layers + # residual_list = [] + # interpolate_residual_list = [] + # quantized_list = [] + with autocast('cuda', enabled = False): + for si, (pt, ph, pw) in enumerate(scale_schedule): + if si < tower_split_index: + tgt_shape = (self.origin_C, 1, H, W) + ss, ee = 0, 1 + else: + tgt_shape = (self.origin_C, T-1, H, W) + ss, ee = 1, T + is_semantic_scale = True + if ph * pw >= self.detail_scale_min_tokens: + is_semantic_scale = False + C1 = self.detail_scale_dim + lfq = self.lfq_detail + else: + C1 = self.semantic_scale_dim + lfq = self.lfq_semantic + + def interpolate(tensor, size, mode, quantizer, is_semantic_scale): + """ + arguments: + tensor: (B,C,T,H,W) + size: (C1,T,H1,W1) + mode: str + quantizer: quantizer + is_semantic_scale: bool + return: + tensor: (B,*size) + """ + B, C, T, H, W = tensor.shape + C1, T, H1, W1 = size + if quantizer.other_args.use_learnable_dim_proj: + if is_semantic_scale: + if C > C1: + proj = self.semantic_proj_down + elif C < C1: + proj = self.semantic_proj_up + else: + if C > C1: + proj = self.detail_proj_down + elif C < C1: + proj = self.detail_proj_up + if C != C1: + tensor = tensor.permute(0,2,3,4,1) # (B,C,T,H,W) -> (B,T,H,W,C) + tensor = proj(tensor) # (B,T,H,W,C1) + tensor = tensor.permute(0,4,1,2,3) # (B,T,H,W,C1) -> (B,C1,T,H,W) + tensor = F.interpolate(tensor, size=(T, H1, W1), mode=mode) # (B,C1,T,H,W) -> (B,C1,T,H1,W1) + return tensor + else: + tensor = tensor.permute(0,2,1,3,4) # (B,C,T,H,W) -> (B,T,C,H,W) + tensor = F.interpolate(tensor, size=(C1, H1, W1), mode=mode) + tensor = tensor.permute(0,2,1,3,4) # (B,T,C1,H1,W1) -> (B,C1,T,H1,W1) + return tensor + + if ph * pw < 16*16: # 192p drop + skip_detail_scales = False + else: + if random.random() < self.other_args.skip_detail_scales_prob: + skip_detail_scales = True + + if (not skip_detail_scales): + interpolate_residual = interpolate(residual[:, :, ss:ee, :, :].clone(), size=(C1, pt, ph, pw), mode=self.z_interplote_down, quantizer=self, is_semantic_scale=is_semantic_scale) + quantized, indices = lfq(interpolate_residual) + quantized = interpolate(quantized, size=tgt_shape, mode=self.z_interplote_up, quantizer=self, is_semantic_scale=is_semantic_scale) + all_indices.append(indices) + # all_losses.append(loss) + residual[:, :, ss:ee, :, :] = residual[:, :, ss:ee, :, :] - quantized + quantized_out = quantized_out + quantized + if si == tower_split_index - 1: + quantized_out_firstframe = quantized_out.clone() + quantized_out = 0 + + if quantized_out_firstframe is not None: + if len(scale_schedule) == tower_split_index: + quantized_out = quantized_out_firstframe + else: + quantized_out = torch.cat([quantized_out_firstframe, quantized_out], dim=2) + + # stack all losses and indices + + all_losses = None + + ret = (quantized_out, all_indices, all_losses) + + if not return_all_codes: + return ret + + # whether to return all codes from all codebooks across layers + all_codes = self.get_codes_from_indices(all_indices) + + # will return all codes in shape (quantizer, batch, sequence length, codebook dimension) + + return (*ret, all_codes) diff --git a/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/vector_quantize_pytorch.py b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/vector_quantize_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..b8716c6c761e20ddec629f02504b2ca5d6d05348 --- /dev/null +++ 
b/Meissonic/InfinityStar/infinity/models/videovae/modules/quantizer/vector_quantize_pytorch.py @@ -0,0 +1,1110 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +from functools import partial + +import torch +from torch import nn, einsum +import torch.nn.functional as F +import torch.distributed as distributed +from torch.optim import Optimizer +from torch.amp import autocast + +from einops import rearrange, repeat, reduce, pack, unpack + +from typing import Callable + +def exists(val): + return val is not None + +def default(val, d): + return val if exists(val) else d + +def noop(*args, **kwargs): + pass + +def identity(t): + return t + +def l2norm(t): + return F.normalize(t, p = 2, dim = -1) + +def cdist(x, y): + x2 = reduce(x ** 2, 'b n d -> b n', 'sum') + y2 = reduce(y ** 2, 'b n d -> b n', 'sum') + xy = einsum('b i d, b j d -> b i j', x, y) * -2 + return (rearrange(x2, 'b i -> b i 1') + rearrange(y2, 'b j -> b 1 j') + xy).clamp(min = 0).sqrt() + +def log(t, eps = 1e-20): + return torch.log(t.clamp(min = eps)) + +def ema_inplace(old, new, decay): + is_mps = str(old.device).startswith('mps:') + + if not is_mps: + old.lerp_(new, 1 - decay) + else: + old.mul_(decay).add_(new * (1 - decay)) + +def pack_one(t, pattern): + return pack([t], pattern) + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + +def uniform_init(*shape): + t = torch.empty(shape) + nn.init.kaiming_uniform_(t) + return t + +def gumbel_noise(t): + noise = torch.zeros_like(t).uniform_(0, 1) + return -log(-log(noise)) + +def gumbel_sample( + logits, + temperature = 1., + stochastic = False, + straight_through = False, + reinmax = False, + dim = -1, + training = True +): + dtype, size = logits.dtype, logits.shape[dim] + + if training and stochastic and temperature > 0: + sampling_logits = (logits / temperature) + gumbel_noise(logits) + else: + sampling_logits = logits + + ind = sampling_logits.argmax(dim = dim) + one_hot = F.one_hot(ind, size).type(dtype) + + assert not (reinmax and not straight_through), 'reinmax can only be turned on if using straight through gumbel softmax' + + if not straight_through or temperature <= 0. 
or not training: + return ind, one_hot + + # use reinmax for better second-order accuracy - https://arxiv.org/abs/2304.08612 + # algorithm 2 + + if reinmax: + π0 = logits.softmax(dim = dim) + π1 = (one_hot + (logits / temperature).softmax(dim = dim)) / 2 + π1 = ((log(π1) - logits).detach() + logits).softmax(dim = 1) + π2 = 2 * π1 - 0.5 * π0 + one_hot = π2 - π2.detach() + one_hot + else: + π1 = (logits / temperature).softmax(dim = dim) + one_hot = one_hot + π1 - π1.detach() + + return ind, one_hot + +def laplace_smoothing(x, n_categories, eps = 1e-5, dim = -1): + denom = x.sum(dim = dim, keepdim = True) + return (x + eps) / (denom + n_categories * eps) + +def sample_vectors(samples, num): + num_samples, device = samples.shape[0], samples.device + if num_samples >= num: + indices = torch.randperm(num_samples, device = device)[:num] + else: + indices = torch.randint(0, num_samples, (num,), device = device) + + return samples[indices] + +def batched_sample_vectors(samples, num): + return torch.stack([sample_vectors(sample, num) for sample in samples.unbind(dim = 0)], dim = 0) + +def pad_shape(shape, size, dim = 0): + return [size if i == dim else s for i, s in enumerate(shape)] + +def sample_multinomial(total_count, probs): + device = probs.device + probs = probs.cpu() + + total_count = probs.new_full((), total_count) + remainder = probs.new_ones(()) + sample = torch.empty_like(probs, dtype = torch.long) + + for i, p in enumerate(probs): + s = torch.binomial(total_count, p / remainder) + sample[i] = s + total_count -= s + remainder -= p + + return sample.to(device) + +def all_gather_sizes(x, dim): + size = torch.tensor(x.shape[dim], dtype = torch.long, device = x.device) + all_sizes = [torch.empty_like(size) for _ in range(distributed.get_world_size())] + distributed.all_gather(all_sizes, size) + return torch.stack(all_sizes) + +def all_gather_variably_sized(x, sizes, dim = 0): + rank = distributed.get_rank() + all_x = [] + + for i, size in enumerate(sizes): + t = x if i == rank else x.new_empty(pad_shape(x.shape, size, dim)) + distributed.broadcast(t, src = i, async_op = True) + all_x.append(t) + + distributed.barrier() + return all_x + +def sample_vectors_distributed(local_samples, num): + local_samples = rearrange(local_samples, '1 ... -> ...') + + rank = distributed.get_rank() + all_num_samples = all_gather_sizes(local_samples, dim = 0) + + if rank == 0: + samples_per_rank = sample_multinomial(num, all_num_samples / all_num_samples.sum()) + else: + samples_per_rank = torch.empty_like(all_num_samples) + + distributed.broadcast(samples_per_rank, src = 0) + samples_per_rank = samples_per_rank.tolist() + + local_samples = sample_vectors(local_samples, samples_per_rank[rank]) + all_samples = all_gather_variably_sized(local_samples, samples_per_rank, dim = 0) + out = torch.cat(all_samples, dim = 0) + + return rearrange(out, '... 
-> 1 ...') + +def batched_bincount(x, *, minlength): + batch, dtype, device = x.shape[0], x.dtype, x.device + target = torch.zeros(batch, minlength, dtype = dtype, device = device) + values = torch.ones_like(x) + target.scatter_add_(-1, x, values) + return target + +def kmeans( + samples, + num_clusters, + num_iters = 10, + use_cosine_sim = False, + sample_fn = batched_sample_vectors, + all_reduce_fn = noop +): + num_codebooks, dim, dtype, device = samples.shape[0], samples.shape[-1], samples.dtype, samples.device + + means = sample_fn(samples, num_clusters) + + for _ in range(num_iters): + if use_cosine_sim: + dists = samples @ rearrange(means, 'h n d -> h d n') + else: + dists = -cdist(samples, means) + + buckets = torch.argmax(dists, dim = -1) + bins = batched_bincount(buckets, minlength = num_clusters) + all_reduce_fn(bins) + + zero_mask = bins == 0 + bins_min_clamped = bins.masked_fill(zero_mask, 1) + + new_means = buckets.new_zeros(num_codebooks, num_clusters, dim, dtype = dtype) + + new_means.scatter_add_(1, repeat(buckets, 'h n -> h n d', d = dim), samples) + new_means = new_means / rearrange(bins_min_clamped, '... -> ... 1') + all_reduce_fn(new_means) + + if use_cosine_sim: + new_means = l2norm(new_means) + + means = torch.where( + rearrange(zero_mask, '... -> ... 1'), + means, + new_means + ) + + return means, bins + +def batched_embedding(indices, embeds): + batch, dim = indices.shape[1], embeds.shape[-1] + indices = repeat(indices, 'h b n -> h b n d', d = dim) + embeds = repeat(embeds, 'h c d -> h b c d', b = batch) + return embeds.gather(2, indices) + +# regularization losses + +def orthogonal_loss_fn(t): + # eq (2) from https://arxiv.org/abs/2112.00384 + h, n = t.shape[:2] + normed_codes = l2norm(t) + cosine_sim = einsum('h i d, h j d -> h i j', normed_codes, normed_codes) + return (cosine_sim ** 2).sum() / (h * n ** 2) - (1 / n) + +# distance types + +class EuclideanCodebook(nn.Module): + def __init__( + self, + dim, + codebook_size, + num_codebooks = 1, + kmeans_init = False, + kmeans_iters = 10, + sync_kmeans = True, + decay = 0.8, + eps = 1e-5, + threshold_ema_dead_code = 2, + reset_cluster_size = None, + use_ddp = False, + learnable_codebook = False, + gumbel_sample = gumbel_sample, + sample_codebook_temp = 1., + ema_update = True, + affine_param = False, + sync_affine_param = False, + affine_param_batch_decay = 0.99, + affine_param_codebook_decay = 0.9 + ): + super().__init__() + self.transform_input = identity + + self.decay = decay + self.ema_update = ema_update + + init_fn = uniform_init if not kmeans_init else torch.zeros + embed = init_fn(num_codebooks, codebook_size, dim) + + self.codebook_size = codebook_size + self.num_codebooks = num_codebooks + + self.kmeans_iters = kmeans_iters + self.eps = eps + self.threshold_ema_dead_code = threshold_ema_dead_code + self.reset_cluster_size = default(reset_cluster_size, threshold_ema_dead_code) + + assert callable(gumbel_sample) + self.gumbel_sample = gumbel_sample + self.sample_codebook_temp = sample_codebook_temp + + assert not (use_ddp and num_codebooks > 1 and kmeans_init), 'kmeans init is not compatible with multiple codebooks in distributed environment for now' + + self.sample_fn = sample_vectors_distributed if use_ddp and sync_kmeans else batched_sample_vectors + self.kmeans_all_reduce_fn = distributed.all_reduce if use_ddp and sync_kmeans else noop + self.all_reduce_fn = distributed.all_reduce if use_ddp else noop + + self.register_buffer('initted', torch.Tensor([not kmeans_init])) + 
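# EMA codebook statistics used by forward(): 'cluster_size' holds an EMA of
+ # per-code assignment counts and 'embed_avg' an EMA of the summed vectors
+ # assigned to each code; the working 'embed' is refreshed from
+ # embed_avg / (Laplace-smoothed cluster_size) during EMA training updates. +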
self.register_buffer('cluster_size', torch.zeros(num_codebooks, codebook_size)) + self.register_buffer('embed_avg', embed.clone()) + + self.learnable_codebook = learnable_codebook + if learnable_codebook: + self.embed = nn.Parameter(embed) + else: + self.register_buffer('embed', embed) + + # affine related params + + self.affine_param = affine_param + self.sync_affine_param = sync_affine_param + + if not affine_param: + return + + self.affine_param_batch_decay = affine_param_batch_decay + self.affine_param_codebook_decay = affine_param_codebook_decay + + self.register_buffer('batch_mean', None) + self.register_buffer('batch_variance', None) + + self.register_buffer('codebook_mean_needs_init', torch.Tensor([True])) + self.register_buffer('codebook_mean', torch.empty(num_codebooks, 1, dim)) + self.register_buffer('codebook_variance_needs_init', torch.Tensor([True])) + self.register_buffer('codebook_variance', torch.empty(num_codebooks, 1, dim)) + + @torch.jit.ignore + def init_embed_(self, data, mask = None): + if self.initted: + return + + if exists(mask): + c = data.shape[0] + data = rearrange(data[mask], '(c n) d -> c n d', c = c) + + embed, cluster_size = kmeans( + data, + self.codebook_size, + self.kmeans_iters, + sample_fn = self.sample_fn, + all_reduce_fn = self.kmeans_all_reduce_fn + ) + + embed_sum = embed * rearrange(cluster_size, '... -> ... 1') + + self.embed.data.copy_(embed) + self.embed_avg.data.copy_(embed_sum) + self.cluster_size.data.copy_(cluster_size) + self.initted.data.copy_(torch.Tensor([True])) + + @torch.jit.ignore + def update_with_decay(self, buffer_name, new_value, decay): + old_value = getattr(self, buffer_name) + + needs_init = getattr(self, buffer_name + "_needs_init", False) + + if needs_init: + self.register_buffer(buffer_name + "_needs_init", torch.Tensor([False])) + + if not exists(old_value) or needs_init: + self.register_buffer(buffer_name, new_value.detach()) + + return + + value = old_value * decay + new_value.detach() * (1 - decay) + self.register_buffer(buffer_name, value) + + @torch.jit.ignore + def update_affine(self, data, embed, mask = None): + assert self.affine_param + + var_fn = partial(torch.var, unbiased = False) + + # calculate codebook mean and variance + + embed = rearrange(embed, 'h ... d -> h (...) d') + + if self.training: + self.update_with_decay('codebook_mean', reduce(embed, 'h n d -> h 1 d', 'mean'), self.affine_param_codebook_decay) + self.update_with_decay('codebook_variance', reduce(embed, 'h n d -> h 1 d', var_fn), self.affine_param_codebook_decay) + + # prepare batch data, which depends on whether it has masking + + data = rearrange(data, 'h ... d -> h (...) 
d') + + if exists(mask): + c = data.shape[0] + data = rearrange(data[mask], '(c n) d -> c n d', c = c) + + # calculate batch mean and variance + + if not self.sync_affine_param: + self.update_with_decay('batch_mean', reduce(data, 'h n d -> h 1 d', 'mean'), self.affine_param_batch_decay) + self.update_with_decay('batch_variance', reduce(data, 'h n d -> h 1 d', var_fn), self.affine_param_batch_decay) + return + + num_vectors, device, dtype = data.shape[-2], data.device, data.dtype + + # number of vectors, for denominator + + num_vectors = torch.tensor([num_vectors], device = device, dtype = dtype) + distributed.all_reduce(num_vectors) + + # calculate distributed mean + + batch_sum = reduce(data, 'h n d -> h 1 d', 'sum') + distributed.all_reduce(batch_sum) + batch_mean = batch_sum / num_vectors + + self.update_with_decay('batch_mean', batch_mean, self.affine_param_batch_decay) + + # calculate distributed variance + + variance_numer = reduce((data - batch_mean) ** 2, 'h n d -> h 1 d', 'sum') + distributed.all_reduce(variance_numer) + batch_variance = variance_numer / num_vectors + + self.update_with_decay('batch_variance', batch_variance, self.affine_param_batch_decay) + + def replace(self, batch_samples, batch_mask): + for ind, (samples, mask) in enumerate(zip(batch_samples.unbind(dim = 0), batch_mask.unbind(dim = 0))): + if not torch.any(mask): + continue + + sampled = self.sample_fn(rearrange(samples, '... -> 1 ...'), mask.sum().item()) + sampled = rearrange(sampled, '1 ... -> ...') + + self.embed.data[ind][mask] = sampled + + self.cluster_size.data[ind][mask] = self.reset_cluster_size + self.embed_avg.data[ind][mask] = sampled * self.reset_cluster_size + + def expire_codes_(self, batch_samples): + if self.threshold_ema_dead_code == 0: + return + + expired_codes = self.cluster_size < self.threshold_ema_dead_code + + if not torch.any(expired_codes): + return + + batch_samples = rearrange(batch_samples, 'h ... d -> h (...) d') + self.replace(batch_samples, batch_mask = expired_codes) + + @autocast("cuda", enabled = False) + def forward( + self, + x, + sample_codebook_temp = None, + mask = None, + freeze_codebook = False + ): + needs_codebook_dim = x.ndim < 4 + sample_codebook_temp = default(sample_codebook_temp, self.sample_codebook_temp) + + x = x.float() + + if needs_codebook_dim: + x = rearrange(x, '... 
-> 1 ...') + + dtype = x.dtype + flatten, ps = pack_one(x, 'h * d') + + if exists(mask): + mask = repeat(mask, 'b n -> c (b h n)', c = flatten.shape[0], h = flatten.shape[-2] // (mask.shape[0] * mask.shape[1])) + + self.init_embed_(flatten, mask = mask) + + if self.affine_param: + self.update_affine(flatten, self.embed, mask = mask) + + embed = self.embed if self.learnable_codebook else self.embed.detach() + + if self.affine_param: + codebook_std = self.codebook_variance.clamp(min = 1e-5).sqrt() + batch_std = self.batch_variance.clamp(min = 1e-5).sqrt() + embed = (embed - self.codebook_mean) * (batch_std / codebook_std) + self.batch_mean + + dist = -cdist(flatten, embed) + + embed_ind, embed_onehot = self.gumbel_sample(dist, dim = -1, temperature = sample_codebook_temp, training = self.training) + + embed_ind = unpack_one(embed_ind, ps, 'h *') + + if self.training: + unpacked_onehot = unpack_one(embed_onehot, ps, 'h * c') + quantize = einsum('h b n c, h c d -> h b n d', unpacked_onehot, embed) + else: + quantize = batched_embedding(embed_ind, embed) + + if self.training and self.ema_update and not freeze_codebook: + + if self.affine_param: + flatten = (flatten - self.batch_mean) * (codebook_std / batch_std) + self.codebook_mean + + if exists(mask): + embed_onehot[~mask] = 0. + + cluster_size = embed_onehot.sum(dim = 1) + + self.all_reduce_fn(cluster_size) + ema_inplace(self.cluster_size.data, cluster_size, self.decay) + + embed_sum = einsum('h n d, h n c -> h c d', flatten, embed_onehot) + embed_sum = embed_sum.contiguous() + self.all_reduce_fn(embed_sum) + + ema_inplace(self.embed_avg.data, embed_sum, self.decay) + + cluster_size = laplace_smoothing(self.cluster_size, self.codebook_size, self.eps) * self.cluster_size.sum(dim = -1, keepdim = True) + + embed_normalized = self.embed_avg / rearrange(cluster_size, '... -> ... 1') + self.embed.data.copy_(embed_normalized) + self.expire_codes_(x) + + if needs_codebook_dim: + quantize, embed_ind = map(lambda t: rearrange(t, '1 ... 
-> ...'), (quantize, embed_ind)) + + dist = unpack_one(dist, ps, 'h * d') + + return quantize, embed_ind, dist + +class CosineSimCodebook(nn.Module): + def __init__( + self, + dim, + codebook_size, + num_codebooks = 1, + kmeans_init = False, + kmeans_iters = 10, + sync_kmeans = True, + decay = 0.8, + eps = 1e-5, + threshold_ema_dead_code = 2, + reset_cluster_size = None, + use_ddp = False, + learnable_codebook = False, + gumbel_sample = gumbel_sample, + sample_codebook_temp = 1., + ema_update = True + ): + super().__init__() + self.transform_input = l2norm + + self.ema_update = ema_update + self.decay = decay + + if not kmeans_init: + embed = l2norm(uniform_init(num_codebooks, codebook_size, dim)) + else: + embed = torch.zeros(num_codebooks, codebook_size, dim) + + self.codebook_size = codebook_size + self.num_codebooks = num_codebooks + + self.kmeans_iters = kmeans_iters + self.eps = eps + self.threshold_ema_dead_code = threshold_ema_dead_code + self.reset_cluster_size = default(reset_cluster_size, threshold_ema_dead_code) + + assert callable(gumbel_sample) + self.gumbel_sample = gumbel_sample + self.sample_codebook_temp = sample_codebook_temp + + self.sample_fn = sample_vectors_distributed if use_ddp and sync_kmeans else batched_sample_vectors + self.kmeans_all_reduce_fn = distributed.all_reduce if use_ddp and sync_kmeans else noop + self.all_reduce_fn = distributed.all_reduce if use_ddp else noop + + self.register_buffer('initted', torch.Tensor([not kmeans_init])) + self.register_buffer('cluster_size', torch.zeros(num_codebooks, codebook_size)) + self.register_buffer('embed_avg', embed.clone()) + + self.learnable_codebook = learnable_codebook + if learnable_codebook: + self.embed = nn.Parameter(embed) + else: + self.register_buffer('embed', embed) + + @torch.jit.ignore + def init_embed_(self, data, mask = None): + if self.initted: + return + + if exists(mask): + c = data.shape[0] + data = rearrange(data[mask], '(c n) d -> c n d', c = c) + + embed, cluster_size = kmeans( + data, + self.codebook_size, + self.kmeans_iters, + use_cosine_sim = True, + sample_fn = self.sample_fn, + all_reduce_fn = self.kmeans_all_reduce_fn + ) + + embed_sum = embed * rearrange(cluster_size, '... -> ... 1') + + self.embed.data.copy_(embed) + self.embed_avg.data.copy_(embed_sum) + self.cluster_size.data.copy_(cluster_size) + self.initted.data.copy_(torch.Tensor([True])) + + def replace(self, batch_samples, batch_mask): + batch_samples = l2norm(batch_samples) + + for ind, (samples, mask) in enumerate(zip(batch_samples.unbind(dim = 0), batch_mask.unbind(dim = 0))): + if not torch.any(mask): + continue + + sampled = self.sample_fn(rearrange(samples, '... -> 1 ...'), mask.sum().item()) + sampled = rearrange(sampled, '1 ... -> ...') + + self.embed.data[ind][mask] = sampled + self.embed_avg.data[ind][mask] = sampled * self.reset_cluster_size + self.cluster_size.data[ind][mask] = self.reset_cluster_size + + def expire_codes_(self, batch_samples): + if self.threshold_ema_dead_code == 0: + return + + expired_codes = self.cluster_size < self.threshold_ema_dead_code + + if not torch.any(expired_codes): + return + + batch_samples = rearrange(batch_samples, 'h ... d -> h (...) 
d') + self.replace(batch_samples, batch_mask = expired_codes) + + @autocast("cuda", enabled = False) + def forward( + self, + x, + sample_codebook_temp = None, + mask = None, + freeze_codebook = False + ): + needs_codebook_dim = x.ndim < 4 + sample_codebook_temp = default(sample_codebook_temp, self.sample_codebook_temp) + + x = x.float() + + if needs_codebook_dim: + x = rearrange(x, '... -> 1 ...') + + dtype = x.dtype + + flatten, ps = pack_one(x, 'h * d') + + if exists(mask): + mask = repeat(mask, 'b n -> c (b h n)', c = flatten.shape[0], h = flatten.shape[-2] // (mask.shape[0] * mask.shape[1])) + + self.init_embed_(flatten, mask = mask) + + embed = self.embed if self.learnable_codebook else self.embed.detach() + + dist = einsum('h n d, h c d -> h n c', flatten, embed) + + embed_ind, embed_onehot = self.gumbel_sample(dist, dim = -1, temperature = sample_codebook_temp, training = self.training) + embed_ind = unpack_one(embed_ind, ps, 'h *') + + if self.training: + unpacked_onehot = unpack_one(embed_onehot, ps, 'h * c') + quantize = einsum('h b n c, h c d -> h b n d', unpacked_onehot, embed) + else: + quantize = batched_embedding(embed_ind, embed) + + if self.training and self.ema_update and not freeze_codebook: + if exists(mask): + embed_onehot[~mask] = 0. + + bins = embed_onehot.sum(dim = 1) + self.all_reduce_fn(bins) + + ema_inplace(self.cluster_size.data, bins, self.decay) + + embed_sum = einsum('h n d, h n c -> h c d', flatten, embed_onehot) + embed_sum = embed_sum.contiguous() + self.all_reduce_fn(embed_sum) + + ema_inplace(self.embed_avg.data, embed_sum, self.decay) + + cluster_size = laplace_smoothing(self.cluster_size, self.codebook_size, self.eps) * self.cluster_size.sum(dim = -1, keepdim = True) + + embed_normalized = self.embed_avg / rearrange(cluster_size, '... -> ... 1') + embed_normalized = l2norm(embed_normalized) + + self.embed.data.copy_(l2norm(embed_normalized)) + self.expire_codes_(x) + + if needs_codebook_dim: + quantize, embed_ind = map(lambda t: rearrange(t, '1 ... -> ...'), (quantize, embed_ind)) + + dist = unpack_one(dist, ps, 'h * d') + return quantize, embed_ind, dist + +# main class + +class VectorQuantize(nn.Module): + def __init__( + self, + dim, + codebook_size, + codebook_dim = None, + heads = 1, + separate_codebook_per_head = False, + decay = 0.8, + eps = 1e-5, + freeze_codebook = False, + kmeans_init = False, + kmeans_iters = 10, + sync_kmeans = True, + use_cosine_sim = False, + threshold_ema_dead_code = 0, + channel_last = True, + accept_image_fmap = False, + commitment_weight = 1., + commitment_use_cross_entropy_loss = False, + orthogonal_reg_weight = 0., + orthogonal_reg_active_codes_only = False, + orthogonal_reg_max_codes = None, + stochastic_sample_codes = False, + sample_codebook_temp = 1., + straight_through = False, + reinmax = False, # using reinmax for improved straight-through, assuming straight through helps at all + sync_codebook = None, + sync_affine_param = False, + ema_update = True, + learnable_codebook = False, + in_place_codebook_optimizer: Callable[..., Optimizer] = None, # Optimizer used to update the codebook embedding if using learnable_codebook + affine_param = False, + affine_param_batch_decay = 0.99, + affine_param_codebook_decay = 0.9, + sync_update_v = 0. 
# the v that controls optimistic vs pessimistic update for synchronous update rule (21) https://minyoungg.github.io/vqtorch/assets/draft_050523.pdf + ): + super().__init__() + self.dim = dim + self.heads = heads + self.separate_codebook_per_head = separate_codebook_per_head + + codebook_dim = default(codebook_dim, dim) + codebook_input_dim = codebook_dim * heads + + requires_projection = codebook_input_dim != dim + self.project_in = nn.Linear(dim, codebook_input_dim) if requires_projection else nn.Identity() + self.project_out = nn.Linear(codebook_input_dim, dim) if requires_projection else nn.Identity() + + self.has_projections = requires_projection + + self.eps = eps + self.commitment_weight = commitment_weight + self.commitment_use_cross_entropy_loss = commitment_use_cross_entropy_loss # whether to use cross entropy loss to codebook as commitment loss + + self.learnable_codebook = learnable_codebook + + has_codebook_orthogonal_loss = orthogonal_reg_weight > 0 + self.has_codebook_orthogonal_loss = has_codebook_orthogonal_loss + self.orthogonal_reg_weight = orthogonal_reg_weight + self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only + self.orthogonal_reg_max_codes = orthogonal_reg_max_codes + + assert not (ema_update and learnable_codebook), 'learnable codebook not compatible with EMA update' + + assert 0 <= sync_update_v <= 1. + assert not (sync_update_v > 0. and not learnable_codebook), 'learnable codebook must be turned on' + + self.sync_update_v = sync_update_v + + codebook_class = EuclideanCodebook if not use_cosine_sim else CosineSimCodebook + + gumbel_sample_fn = partial( + gumbel_sample, + stochastic = stochastic_sample_codes, + reinmax = reinmax, + straight_through = straight_through + ) + + if not exists(sync_codebook): + sync_codebook = distributed.is_initialized() and distributed.get_world_size() > 1 + + codebook_kwargs = dict( + dim = codebook_dim, + num_codebooks = heads if separate_codebook_per_head else 1, + codebook_size = codebook_size, + kmeans_init = kmeans_init, + kmeans_iters = kmeans_iters, + sync_kmeans = sync_kmeans, + decay = decay, + eps = eps, + threshold_ema_dead_code = threshold_ema_dead_code, + use_ddp = sync_codebook, + learnable_codebook = has_codebook_orthogonal_loss or learnable_codebook, + sample_codebook_temp = sample_codebook_temp, + gumbel_sample = gumbel_sample_fn, + ema_update = ema_update + ) + + if affine_param: + assert not use_cosine_sim, 'affine param is only compatible with euclidean codebook' + codebook_kwargs = dict( + **codebook_kwargs, + affine_param = True, + sync_affine_param = sync_affine_param, + affine_param_batch_decay = affine_param_batch_decay, + affine_param_codebook_decay = affine_param_codebook_decay, + ) + + self._codebook = codebook_class(**codebook_kwargs) + + self.in_place_codebook_optimizer = in_place_codebook_optimizer(self._codebook.parameters()) if exists(in_place_codebook_optimizer) else None + + self.codebook_size = codebook_size + self.register_buffer('codebook_usage', torch.zeros(codebook_size)) + self.call_cnt = 0 + + self.accept_image_fmap = accept_image_fmap + self.channel_last = channel_last + + @property + def codebook(self): + codebook = self._codebook.embed + + if self.separate_codebook_per_head: + return codebook + + return rearrange(codebook, '1 ... -> ...') + + @codebook.setter + def codebook(self, codes): + if not self.separate_codebook_per_head: + codes = rearrange(codes, '... 
-> 1 ...') + + self._codebook.embed.copy_(codes) + + def get_codes_from_indices(self, indices): + codebook = self.codebook + is_multiheaded = codebook.ndim > 2 + + if not is_multiheaded: + codes = codebook[indices] + return rearrange(codes, '... h d -> ... (h d)') + + indices, ps = pack_one(indices, 'b * h') + indices = rearrange(indices, 'b n h -> b h n') + + indices = repeat(indices, 'b h n -> b h n d', d = codebook.shape[-1]) + codebook = repeat(codebook, 'h n d -> b h n d', b = indices.shape[0]) + + codes = codebook.gather(2, indices) + codes = rearrange(codes, 'b h n d -> b n (h d)') + codes = unpack_one(codes, ps, 'b * d') + return codes + + def get_output_from_indices(self, indices): + codes = self.get_codes_from_indices(indices) + return self.project_out(codes) + + def get_perplexity(self, encoding_indices, x): + encode_onehot = F.one_hot(encoding_indices, self.codebook_size).type_as(x) # [bthw, ncode] + avg_probs = torch.mean(encode_onehot, dim=0) + perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10))) + return perplexity + + + def get_usage(self, encoding_indices): + # Flatten the batch of encoding indices into a single 1D tensor + all_indices = encoding_indices.flatten() + + # Obtain the total number of encoding indices in the batch to calculate percentages + total_indices = all_indices.numel() + + # Initialize a tensor to store the percentage usage of each code + codebook_usage_percentage = torch.zeros(self.codebook_size, device=all_indices.device) + + # Count the number of occurrences of each index and get their frequency as percentages + unique_indices, counts = torch.unique(all_indices, return_counts=True) + + # Calculate the percentage + percentages = (counts.float() / total_indices) + + # Populate the corresponding percentages in the codebook_usage_percentage tensor + codebook_usage_percentage[unique_indices.long()] = percentages + + return codebook_usage_percentage + + + def forward( + self, + x, + indices = None, + mask = None, + sample_codebook_temp = None, + freeze_codebook = False, + ): + orig_input = x + + only_one = x.ndim == 2 + + if only_one: + assert not exists(mask) + x = rearrange(x, 'b d -> b 1 d') + + shape, device, heads, is_multiheaded, codebook_size, return_loss = x.shape, x.device, self.heads, self.heads > 1, self.codebook_size, exists(indices) + + need_transpose = not self.channel_last and not self.accept_image_fmap + should_inplace_optimize = exists(self.in_place_codebook_optimizer) + + # rearrange inputs + + if self.accept_image_fmap: + nframes, height, width = x.shape[-3:] + x = rearrange(x, 'b c t h w -> b (t h w) c') + + if need_transpose: + x = rearrange(x, 'b d n -> b n d') + + # project input + + x = self.project_in(x) + + # handle multi-headed separate codebooks + + if is_multiheaded: + ein_rhs_eq = 'h b n d' if self.separate_codebook_per_head else '1 (b h) n d' + x = rearrange(x, f'b n (h d) -> {ein_rhs_eq}', h = heads) + + # l2norm for cosine sim, otherwise identity + + x = self._codebook.transform_input(x) + + # codebook forward kwargs + + codebook_forward_kwargs = dict( + sample_codebook_temp = sample_codebook_temp, + mask = mask, + freeze_codebook = freeze_codebook + ) + + # quantize + + quantize, embed_ind, distances = self._codebook(x, **codebook_forward_kwargs) + + # one step in-place update + + if should_inplace_optimize and self.training and not freeze_codebook: + + if exists(mask): + loss = F.mse_loss(quantize, x.detach(), reduction = 'none') + + loss_mask = mask + if is_multiheaded: + loss_mask = repeat(mask, 
'b n -> c (b h) n', c = loss.shape[0], h = loss.shape[1] // mask.shape[0]) + + loss = loss[loss_mask].mean() + + else: + loss = F.mse_loss(quantize, x.detach()) + + loss.backward() + self.in_place_codebook_optimizer.step() + self.in_place_codebook_optimizer.zero_grad() + + # quantize again + + quantize, embed_ind, distances = self._codebook(x, **codebook_forward_kwargs) + + if self.training: + # determine code to use for commitment loss + maybe_detach = torch.detach if not self.learnable_codebook or freeze_codebook else identity + + commit_quantize = maybe_detach(quantize) + + # straight through + + quantize = x + (quantize - x).detach() + + if self.sync_update_v > 0.: + # (21) in https://minyoungg.github.io/vqtorch/assets/draft_050523.pdf + quantize = quantize + self.sync_update_v * (quantize - quantize.detach()) + + # function for calculating cross entropy loss to distance matrix + # used for (1) naturalspeech2 training residual vq latents to be close to the correct codes and (2) cross-entropy based commitment loss + + def calculate_ce_loss(codes): + if not is_multiheaded: + dist_einops_eq = '1 b n l -> b l n' + elif self.separate_codebook_per_head: + dist_einops_eq = 'c b n l -> b l n c' + else: + dist_einops_eq = '1 (b h) n l -> b l n h' + + ce_loss = F.cross_entropy( + rearrange(distances, dist_einops_eq, b = shape[0]), + codes, + ignore_index = -1 + ) + + return ce_loss + + # if returning cross entropy loss on codes that were passed in + + if return_loss: + print(indices) + return quantize, calculate_ce_loss(indices) + + # transform embedding indices + + if is_multiheaded: + if self.separate_codebook_per_head: + embed_ind = rearrange(embed_ind, 'h b n -> b n h', h = heads) + else: + embed_ind = rearrange(embed_ind, '1 (b h) n -> b n h', h = heads) + + if self.accept_image_fmap: + embed_ind = rearrange(embed_ind, 'b (t h w) ... -> b t h w ...', t = nframes, h = height, w = width) + + if only_one: + embed_ind = rearrange(embed_ind, 'b 1 ... 
-> b ...') + + # aggregate loss + + loss = torch.tensor([0.], device = device, requires_grad = self.training) + + if self.training: + if self.commitment_weight > 0: + if self.commitment_use_cross_entropy_loss: + if exists(mask): + ce_loss_mask = mask + if is_multiheaded: + ce_loss_mask = repeat(ce_loss_mask, 'b n -> b n h', h = heads) + + embed_ind.masked_fill_(~ce_loss_mask, -1) + + print(embed_ind.shape, embed_ind) + commit_loss = calculate_ce_loss(embed_ind) + else: + if exists(mask): + # with variable lengthed sequences + commit_loss = F.mse_loss(commit_quantize, x, reduction = 'none') + + loss_mask = mask + if is_multiheaded: + loss_mask = repeat(loss_mask, 'b n -> c (b h) n', c = commit_loss.shape[0], h = commit_loss.shape[1] // mask.shape[0]) + + commit_loss = commit_loss[loss_mask].mean() + else: + commit_loss = F.mse_loss(commit_quantize, x) + + loss = loss + commit_loss * self.commitment_weight + + if self.has_codebook_orthogonal_loss: + codebook = self._codebook.embed + + # only calculate orthogonal loss for the activated codes for this batch + + if self.orthogonal_reg_active_codes_only: + assert not (is_multiheaded and self.separate_codebook_per_head), 'orthogonal regularization for only active codes not compatible with multi-headed with separate codebooks yet' + unique_code_ids = torch.unique(embed_ind) + codebook = codebook[:, unique_code_ids] + + num_codes = codebook.shape[-2] + + if exists(self.orthogonal_reg_max_codes) and num_codes > self.orthogonal_reg_max_codes: + rand_ids = torch.randperm(num_codes, device = device)[:self.orthogonal_reg_max_codes] + codebook = codebook[:, rand_ids] + + orthogonal_reg_loss = orthogonal_loss_fn(codebook) + loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight + + # handle multi-headed quantized embeddings + + if is_multiheaded: + if self.separate_codebook_per_head: + quantize = rearrange(quantize, 'h b n d -> b n (h d)', h = heads) + else: + quantize = rearrange(quantize, '1 (b h) n d -> b n (h d)', h = heads) + + # project out + + quantize = self.project_out(quantize) + + # rearrange quantized embeddings + + if need_transpose: + quantize = rearrange(quantize, 'b n d -> b d n') + + if self.accept_image_fmap: + quantize = rearrange(quantize, 'b (t h w) c -> b c t h w', t = nframes, h = height, w = width) + + if only_one: + quantize = rearrange(quantize, 'b 1 d -> b d') + + # if masking, only return quantized for where mask has True + + if exists(mask): + quantize = torch.where( + rearrange(mask, '... -> ... 
1'), + quantize, + orig_input + ) + + # return quantize, embed_ind, loss + perplexity = self.get_perplexity(embed_ind, x) + usage = self.get_usage(embed_ind) + + if self.call_cnt == 0: + self.codebook_usage.data = usage + else: + self.codebook_usage.data = 0.99 * self.codebook_usage.data + (1 - 0.99) * usage + + self.call_cnt += 1 + # avg_distribution = self.codebook_usage.data.sum() / self.codebook_size + avg_usage = (self.codebook_usage.data > (1/self.codebook_size)).sum() / self.codebook_size + + return dict(embeddings=quantize, encodings=embed_ind, + commitment_loss=loss, perplexity=perplexity, avg_usage=avg_usage, batch_usage=usage) \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/__init__.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2315f167ec0a462e30e0b80aeadab42698d42bd3 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from .arguments import str2bool \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9716eeb294653e61c4c7d59c7c03fa95dcd6686 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/arguments.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/arguments.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df66be431614cc6e4ffe08494eb5d8c7bf32b5cb Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/arguments.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/context_parallel.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/context_parallel.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68c5213142fc37be12479a2583aa6ca3c1ba815d Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/context_parallel.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/dynamic_resolution.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/dynamic_resolution.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08821bcb120958ff7a2e3b88701861a35a53b1d2 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/dynamic_resolution.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/dynamic_resolution_two_pyramid.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/dynamic_resolution_two_pyramid.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56c5a379e87659768601562942e7bdfabb5be63f Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/dynamic_resolution_two_pyramid.cpython-310.pyc differ diff --git 
a/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/misc.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/misc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3baeb32f0e91e04640e94a107c5464cbce557544 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/__pycache__/misc.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/arguments.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/arguments.py new file mode 100644 index 0000000000000000000000000000000000000000..e47a334430f97185fc1581d52b746a0bd54ca41f --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/arguments.py @@ -0,0 +1,253 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import argparse + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('true'): + return True + elif v.lower() in ('false'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + +def add_model_specific_args(args, parser): + from infinity.models.videovae.models import CVIVIT_VQGAN, CNN_VQGAN, FLUX_VAE, MS_VAE, CogVAE, SlowFastVAE, HunYuanVAE, CogVAEL + + if args.tokenizer == "cvivit": + parser = CVIVIT_VQGAN.add_model_specific_args(parser) + vae_model = CVIVIT_VQGAN + elif args.tokenizer == "cnn": + parser = CNN_VQGAN.add_model_specific_args(parser) + vae_model = CNN_VQGAN + elif args.tokenizer in ["flux"]: # add cogvideo here to align evaluation configs + parser = CNN_VQGAN.add_model_specific_args(parser) # align with cnn config + parser = FLUX_VAE.add_model_specific_args(parser) # flux config + vae_model = FLUX_VAE + elif args.tokenizer == "ms": + parser = CNN_VQGAN.add_model_specific_args(parser) # align with cnn config + parser = FLUX_VAE.add_model_specific_args(parser) # align with flux config + vae_model = MS_VAE + elif args.tokenizer in ["sd", "sd-vq", "mar", "cogvideox_origin", "vidtok", "open-sora-plan", "step-fun", "hunyuan_origin"]: + vae_model = None + pass + elif args.tokenizer in ["cogvideox"]: + parser = CogVAE.add_model_specific_args(parser) + parser = FLUX_VAE.add_model_specific_args(parser) + vae_model = CogVAE + elif args.tokenizer in ["cogvideoxl"]: + parser = CogVAEL.add_model_specific_args(parser) + parser = FLUX_VAE.add_model_specific_args(parser) + vae_model = CogVAEL + elif args.tokenizer in ["slow-fast"]: + parser = SlowFastVAE.add_model_specific_args(parser) + parser = FLUX_VAE.add_model_specific_args(parser) + vae_model = SlowFastVAE + elif args.tokenizer in ["hunyuan"]: + parser = HunYuanVAE.add_model_specific_args(parser) + parser = FLUX_VAE.add_model_specific_args(parser) + vae_model = HunYuanVAE + else: + raise NotImplementedError + return args, parser, vae_model + +class MainArgs: + @staticmethod + def add_main_args(parser): + # training + parser.add_argument('--max_steps', type=int, default=1e6) + parser.add_argument('--log_every', type=int, default=1) + parser.add_argument('--ckpt_every', type=int, default=1000) + parser.add_argument('--default_root_dir', type=str, required=True) + parser.add_argument('--compile', type=str, default="no", choices=["no", "yes"]) + parser.add_argument('--ema', type=str, default="no", choices=["no", "yes"]) + parser.add_argument('--mfu_logging', type=str, default="no", choices=["no", "yes"]) + parser.add_argument('--dataloader_init_epoch', type=int, default=-1) + parser.add_argument('--context_parallel_size', type=int, default=0) + + # 
optimization + parser.add_argument('--lr', type=float, default=1e-4) + parser.add_argument('--beta1', type=float, default=0.9) + parser.add_argument('--beta2', type=float, default=0.95) + parser.add_argument('--optim_type', type=str, default="Adam", choices=["Adam", "AdamW"]) + parser.add_argument('--disc_optim_type', type=str, default=None, choices=[None, "rmsprop"]) + parser.add_argument('--max_grad_norm', type=float, default=1.0) + parser.add_argument('--max_grad_norm_disc', type=float, default=1.0) + parser.add_argument('--disable_sch', action="store_true") # deprecated option + parser.add_argument('--scheduler', type=str, default="no", choices=["no", "linear"]) + parser.add_argument('--warmup_steps', type=int, default=0) + parser.add_argument('--lr_min', type=float, default=0.) + parser.add_argument('--warmup_lr_init', type=float, default=0.) + + # basic vae config + parser.add_argument('--patch_size', type=int, default=8) + parser.add_argument('--temporal_patch_size', type=int, default=4) + parser.add_argument('--embedding_dim', type=int, default=256) + parser.add_argument('--codebook_dim', type=int, default=16) + parser.add_argument('--use_vae', action="store_true") + parser.add_argument('--fix_model', type=str, default='no', choices=['no', 'encoder', 'encoder_decoder']) + + # discrete vae config + parser.add_argument('--use_stochastic_depth', action="store_true") + parser.add_argument("--drop_rate", type=float, default=0.0) + parser.add_argument('--schedule_mode', type=str, default="original", choices=["original", "dynamic", "dense", "same1", "same2", "same3", "half", "dense_f8", "dense_f8_double"]) + parser.add_argument('--lr_drop', nargs='*', type=int, default=None, help="A list of numeric values. Example: --values 270 300") + parser.add_argument('--lr_drop_rate', type=float, default=0.1) + parser.add_argument('--keep_first_quant', action="store_true") + parser.add_argument('--keep_last_quant', action="store_true") + parser.add_argument('--remove_residual_detach', action="store_true") + parser.add_argument('--use_out_phi', action="store_true") + parser.add_argument('--use_out_phi_res', action="store_true") + parser.add_argument('--use_lecam_reg', action="store_true") + parser.add_argument('--lecam_weight', type=float, default=0.05) + parser.add_argument('--perceptual_model', type=str, default="vgg16", choices=["vgg16", "resnet50", "resnet50_v2"]) + parser.add_argument('--base_ch_disc', type=int, default=64) + parser.add_argument('--random_flip', action="store_true") + parser.add_argument('--flip_prob', type=float, default=0.5) + parser.add_argument('--flip_mode', type=str, default="stochastic", choices=["stochastic", "deterministic", "stochastic_dynamic"]) + parser.add_argument('--max_flip_lvl', type=int, default=1) + parser.add_argument('--not_load_optimizer', action="store_true") + parser.add_argument('--use_lecam_reg_zero', action="store_true") + parser.add_argument('--freeze_encoder', action="store_true") + parser.add_argument('--rm_downsample', action="store_true") + parser.add_argument('--random_flip_1lvl', action="store_true") + parser.add_argument('--flip_lvl_idx', type=int, default=0) + parser.add_argument('--drop_when_test', action="store_true") + parser.add_argument('--drop_lvl_idx', type=int, default=None) + parser.add_argument('--drop_lvl_num', type=int, default=0) + parser.add_argument('--compute_all_commitment', action="store_true") + parser.add_argument('--disable_codebook_usage', action="store_true") + parser.add_argument('--freeze_enc_main', action="store_true") 
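+ # Note: with the defaults above (--embedding_dim 256, --codebook_dim 16), a quantizer built like the
+ # VectorQuantize module earlier in this diff would enable its project_in/project_out linear layers,
+ # since that module turns projection on whenever codebook_dim * heads != dim. (Assumption: these flags
+ # are forwarded to that module; the wiring is not shown in this file.)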
+ parser.add_argument('--freeze_dec_main', action="store_true") + parser.add_argument('--random_short_schedule', action="store_true") + parser.add_argument('--short_schedule_prob', type=float, default=0.5) + parser.add_argument('--use_bernoulli', action="store_true") + parser.add_argument('--use_rot_trick', action="store_true") + parser.add_argument('--disable_flip_prob', type=float, default=0.0) + parser.add_argument('--dino_disc', action="store_true") + parser.add_argument('--quantizer_type', type=str, default='MultiScaleBSQ') + parser.add_argument('--lfq_weight', type=float, default=0.) + parser.add_argument('--entropy_loss_weight', type=float, default=0.1) + parser.add_argument('--visu_every', type=int, default=1000) + parser.add_argument('--commitment_loss_weight', type=float, default=0.25) + parser.add_argument('--bsq_version', type=str, default="v1", choices=["v1", "v2"]) + parser.add_argument('--diversity_gamma', type=float, default=1) + parser.add_argument('--bs1_for1024', action="store_true") + parser.add_argument('--casual_multi_scale', action="store_true") + parser.add_argument('--double_compress_t', action="store_true") + parser.add_argument('--temporal_slicing', action="store_true") + parser.add_argument('--latent_adjust_type', type=str, default=None) + parser.add_argument('--compute_latent_loss', action="store_true") + parser.add_argument('--latent_loss_weight', type=float, default=0.0) + + # discriminator config + parser.add_argument('--disc_version', type=str, default="v1") + parser.add_argument('--magvit_disc', action="store_true") # deprecated + parser.add_argument('--disc_type', type=str, default="patchgan", choices=["patchgan", "stylegan"]) + parser.add_argument('--sigmoid_in_disc', action="store_true") + parser.add_argument('--activation_in_disc', type=str, default="leaky_relu") + parser.add_argument('--apply_blur', action="store_true") + parser.add_argument('--apply_noise', action="store_true") + parser.add_argument('--dis_warmup_steps', type=int, default=0) + parser.add_argument('--dis_lr_multiplier', type=float, default=1.) 
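+ # The commitment term in the VectorQuantize module earlier in this diff is an MSE between the encoder
+ # output and the (usually detached) quantized vectors, scaled by its commitment_weight; presumably
+ # --commitment_loss_weight above populates that value, though the plumbing is outside this file.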
+ parser.add_argument('--dis_minlr_multiplier', action="store_true") + parser.add_argument('--disc_channels', type=int, default=64) + parser.add_argument('--disc_layers', type=int, default=3) + parser.add_argument('--discriminator_iter_start', type=int, default=0) + parser.add_argument('--disc_pretrain_iter', type=int, default=0) + parser.add_argument('--disc_optim_steps', type=int, default=1) + parser.add_argument('--disc_warmup', type=int, default=0) + parser.add_argument('--disc_pool', type=str, default="no", choices=["no", "yes"]) + parser.add_argument('--disc_pool_size', type=int, default=100) + parser.add_argument('--disc_temporal_compress', type=str, default="yes", choices=["no", "yes"]) + parser.add_argument('--disc_use_blur', type=str, default="yes", choices=["no", "yes"]) + parser.add_argument('--disc_stylegan_downsample_base', type=int, default=2) + + parser = MainArgs.add_loss_args(parser) + parser = MainArgs.add_accelerate_args(parser) + + # initialization + parser.add_argument('--tokenizer', type=str, required=True) + parser.add_argument('--pretrained', type=str, default=None) + parser.add_argument('--pretrained_mode', type=str, default="full") + parser.add_argument('--pretrained_ema', type=str, default="no") + parser.add_argument('--inflation_pe', action="store_true") + parser.add_argument('--init_vgen', type=str, default='no', choices=['no', 'keep', 'average']) + parser.add_argument('--no_init_idis', action="store_true") # deprecated option + parser.add_argument('--init_idis', type=str, default='keep', choices=['no', 'keep']) # use keep by default following previous settings + parser.add_argument('--init_vdis', type=str, default="no") + + # misc + parser.add_argument('--enable_nan_detector', action='store_true') + parser.add_argument('--turn_on_profiler', action='store_true') + parser.add_argument('--profiler_scheduler_wait_steps', type=int, default=10) + parser.add_argument('--debug', action='store_true') + parser.add_argument('--video_logger', action='store_true') # deprecated option + parser.add_argument('--bytenas', type=str, default="sg") + parser.add_argument('--username', type=str, default="zhufengda") + parser.add_argument('--seed', type=int, default=1234) + parser.add_argument('--vq_to_vae', action='store_true') + parser.add_argument('--load_not_strict', action='store_true') + parser.add_argument('--zero', type=int, default=0, choices=[0, 1, 2, 3]) # 1 hybrid shard, 2 shard grad_op, 3 full shard + parser.add_argument('--bucket_cap_mb', type=int, default=40) # DDP + parser.add_argument('--manual_gc_interval', type=int, default=10000) # DDP + + return parser + + @staticmethod + def add_loss_args(parser): + parser.add_argument("--recon_loss_type", type=str, default='l1', choices=['l1', 'l2']) + parser.add_argument('--video_perceptual_weight', type=float, default=0.) + parser.add_argument('--image_gan_weight', type=float, default=1.0) + parser.add_argument('--video_gan_weight', type=float, default=1.0) + parser.add_argument('--image_disc_weight', type=float, default=0.) + parser.add_argument('--video_disc_weight', type=float, default=0.) + parser.add_argument('--l1_weight', type=float, default=4.0) + parser.add_argument('--gan_feat_weight', type=float, default=0.0) + parser.add_argument('--lpips_model', type=str, default='vgg', choices=['vgg', 'resnet50']) + parser.add_argument('--perceptual_weight', type=float, default=0.0) + parser.add_argument('--kl_weight', type=float, default=0.) 
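+ # Illustrative only: the loss weights above are combined with the reconstruction objective at train
+ # time, e.g. (hypothetical flag values):
+ #   torchrun train.py --tokenizer flux --default_root_dir ./runs/exp --l1_weight 4.0 --perceptual_weight 1.0 --kl_weight 1e-6
+ # --tokenizer and --default_root_dir are the only arguments marked required=True in MainArgs.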
+ parser.add_argument('--norm_type', type=str, default='group', choices=['batch', 'group', "no"]) + parser.add_argument('--disc_loss_type', type=str, default='hinge', choices=['hinge', 'vanilla']) + parser.add_argument('--gan_image4video', type=str, default='yes', choices=['no', 'yes']) + return parser + + @staticmethod + def add_accelerate_args(parser): + parser.add_argument('--use_checkpoint', action="store_true") + parser.add_argument('--precision', type=str, default="fp32", choices=['fp32', 'bf16']) # disable fp16 + parser.add_argument('--encoder_dtype', type=str, default="fp32", choices=['fp32', 'bf16']) # disable fp16 + parser.add_argument('--decoder_dtype', type=str, default="fp32", choices=['fp32', 'bf16']) # disable fp16 + parser.add_argument('--upcast_attention', type=str, default="", choices=["qk", "qkv"]) + parser.add_argument('--upcast_tf32', action="store_true") + return parser + +def format_args(args): + # Start building the script string + script_content = "#!/bin/bash\n\n" + script_content += "torchrun \\\n" + script_content += " --nproc_per_node=$ARNOLD_WORKER_GPU \\\n" + script_content += " --nnodes=$ARNOLD_WORKER_NUM --master_addr=$ARNOLD_WORKER_0_HOST \\\n" + script_content += " --node_rank=$ARNOLD_ID --master_port=$port \\\n" + script_content += " train.py \\\n" + + # Iterate over each key-value pair and append it to the command + for k, v in args.__dict__.items(): + script_content += f" --{k} {v} \\\n" + + # Remove the last backslash and newline + script_content = script_content.rstrip(" \\\n") + "\n" + return script_content + +def init_resolution(resolution, num_datasets): + if len(resolution) == 1: + resolution = [(resolution[0], resolution[0])] * num_datasets + elif len(resolution) == num_datasets: + resolution = [(resolution[i], resolution[i]) for i in range(len(resolution))] + elif len(resolution) == num_datasets * 2: + resolution = [(resolution[i], resolution[i+1]) for i in range(0, len(resolution), 2)] + else: + raise NotImplementedError + return resolution diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/context_parallel.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/context_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..e34d4e8e0482fa49c13c5661b5b5a09df679c686 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/context_parallel.py @@ -0,0 +1,172 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import math +import torch +import torch.nn as nn +import torch.distributed as dist + +import infinity.models.videovae.utils.diffdist.functional as distops + +class ContextParallelUtils: + _CONTEXT_PARALLEL_GROUP = None + _CONTEXT_PARALLEL_SIZE = 0 + _CONTEXT_PARALLEL_ON = False + + """ + { + "cp_size": 2, + } + """ + CP_CONFIG = None + + @staticmethod + def set_cp_on(on=True): + ContextParallelUtils._CONTEXT_PARALLEL_ON = on + + @staticmethod + def cp_on(): + return ContextParallelUtils._CONTEXT_PARALLEL_ON + + @staticmethod + def get_cp_cfg(): + return ContextParallelUtils.CP_CONFIG + + @staticmethod + def is_cp_initialized(): + if ContextParallelUtils._CONTEXT_PARALLEL_GROUP is None: + return False + else: + return True + + @staticmethod + def initialize_context_parallel(cp_config:dict): + assert ContextParallelUtils._CONTEXT_PARALLEL_GROUP is None, "context parallel group is already initialized" + + context_parallel_size = cp_config["cp_size"] + if context_parallel_size > 1: + ContextParallelUtils.CP_CONFIG = cp_config + else: + print(f"WARN: context 
parallel size must > 1 but got {context_parallel_size}") + return + + ContextParallelUtils._CONTEXT_PARALLEL_SIZE = context_parallel_size + + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + for i in range(0, world_size, context_parallel_size): + ranks = range(i, i + context_parallel_size) + group = torch.distributed.new_group(ranks) + if rank in ranks: + ContextParallelUtils._CONTEXT_PARALLEL_GROUP = group + break + + @staticmethod + def get_cp_group(): + return ContextParallelUtils._CONTEXT_PARALLEL_GROUP + + @staticmethod + def get_cp_size(): + return ContextParallelUtils._CONTEXT_PARALLEL_SIZE + + @staticmethod + def get_cp_world_size(): + if ContextParallelUtils.is_cp_initialized(): + world_size = torch.distributed.get_world_size() + return world_size // ContextParallelUtils._CONTEXT_PARALLEL_SIZE + else: + return 0 + + @staticmethod + def get_cp_rank(): + if ContextParallelUtils.is_cp_initialized(): + global_rank = torch.distributed.get_rank() + cp_rank = global_rank % ContextParallelUtils._CONTEXT_PARALLEL_SIZE + return cp_rank + else: + return 0 + + def get_cp_group_rank(): + if ContextParallelUtils.is_cp_initialized(): + rank = torch.distributed.get_rank() + cp_group_rank = rank // ContextParallelUtils._CONTEXT_PARALLEL_SIZE + return cp_group_rank + else: + return 0 + + +def _gather_tensor_shape(local_ts): + cp_size = ContextParallelUtils.get_cp_size() + local_shape = torch.tensor(local_ts.shape, dtype=torch.int64, device=local_ts.device) + gathered_shapes = [torch.zeros(len(local_shape), dtype=torch.int64, device=local_ts.device) for _ in range(cp_size)] + dist.all_gather(gathered_shapes, local_shape, group=ContextParallelUtils._CONTEXT_PARALLEL_GROUP) + return [shape.tolist() for shape in gathered_shapes] + +@torch.compiler.disable() +def dist_encoder_gather_result(res)->list: + cp_size = ContextParallelUtils.get_cp_size() + if cp_size < 2: + return res + + shape_list = _gather_tensor_shape(res) # [[1,2,3,4],[x,x,x,x]] list of shapes on different rank + encs=[torch.zeros(s, device=res.device, dtype=res.dtype) for s in shape_list] + + dist.barrier() + encs = distops.all_gather(encs, res, group=ContextParallelUtils._CONTEXT_PARALLEL_GROUP) + return encs + +@torch.compiler.disable() +def dist_decoder_gather_result(res)->list: + cp_size = ContextParallelUtils.get_cp_size() + if cp_size < 2: + return res + + shape_list = _gather_tensor_shape(res) # [[1,2,3,4],[x,x,x,x]] list of shapes on different rank + decs = [torch.zeros(s, device=res.device, dtype=res.dtype) for s in shape_list] + + dist.barrier() + decs = distops.all_gather(decs, res, group=ContextParallelUtils._CONTEXT_PARALLEL_GROUP) + return decs + + +def _send_with_shape(local_ts, next_rank): + local_shape = torch.tensor(local_ts.shape, dtype=torch.int64, device=local_ts.device) + torch.distributed.send(local_shape.contiguous(), next_rank) + torch.distributed.send(local_ts.contiguous(), next_rank) + +def _recv_with_shape(pre_rank): + device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu') + + shape = torch.zeros(5, dtype=torch.int64, device=device) + torch.distributed.recv(shape, pre_rank) + ts = torch.zeros(shape.tolist(), device=device) + torch.distributed.recv(ts, pre_rank) + return ts + + +@torch.compiler.disable() +def dist_conv_cache_send(conv_cache): + + cp_rank = ContextParallelUtils.get_cp_rank() + global_rank = torch.distributed.get_rank() + cp_size = ContextParallelUtils.get_cp_size() + + if cp_rank == cp_size - 1: + return + if conv_cache 
is None: + return + + next_rank = global_rank + 1 + _send_with_shape(conv_cache, next_rank) + +@torch.compiler.disable() +def dist_conv_cache_recv(): + cp_rank = ContextParallelUtils.get_cp_rank() + global_rank = torch.distributed.get_rank() + + if cp_rank == 0: + return None + + pre_rank = global_rank - 1 + return _recv_with_shape(pre_rank) + diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/README.md b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a16b530b25182895b3d8050afe8353c9cd74569c --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/README.md @@ -0,0 +1,9 @@ +# diffdist, for differentiable communication + +borrowed from https://github.com/ag14774/diffdist and fix code: +``` + # tmp = dist.reduce(tensor_list[i], i, op, group, async_op=True) + # to + # tmp = dist.reduce(tensor_list[i].contiguous(), i, op, group, async_op=True) + +``` diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__init__.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d37e16bfcacf559e32a20f065be5f911c877c7f --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from infinity.models.videovae.utils.diffdist import extra_collectives +from infinity.models.videovae.utils.diffdist import functional +from infinity.models.videovae.utils.diffdist import modules diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2be7df18cf4cb2b11d2ac8abb4ad6c2af109c80 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/extra_collectives.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/extra_collectives.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e23f94f439ce8b70b159cbbbde15ea3780b0556 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/extra_collectives.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/functional.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/functional.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c46817b36cf8baf3453958e4a62efcd0b51768fd Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/functional.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/functions.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/functions.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d015930ba10926277cda91fd6d0f406abc3b46a Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/functions.cpython-310.pyc differ diff --git 
a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/modules.cpython-310.pyc b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/modules.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cdbc2c4c300d887fdbb68bd3735a4ea714c2dc4 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/__pycache__/modules.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/extra_collectives.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/extra_collectives.py new file mode 100644 index 0000000000000000000000000000000000000000..071e8021b72f3be8c6b054c4ad9a8e74bdf92ab3 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/extra_collectives.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch.distributed as dist +from torch.distributed import ReduceOp + + +class AsyncOpList(object): + def __init__(self, ops): + self.ops = ops + + def wait(self): + for op in self.ops: + op.wait() + + def is_completed(self): + for op in self.ops: + if not op.is_completed(): + return False + return True + + +def reduce_scatter(tensor, + tensor_list, + op=ReduceOp.SUM, + group=dist.group.WORLD, + async_op=False): + ranks = dist.get_process_group_ranks(group) + rank = dist.get_rank(group) + if tensor is None: + tensor = tensor_list[rank] + if tensor.dim() == 0: + tensor = tensor.view(-1) + tensor[:] = tensor_list[rank] + ops = [] + for i in range(dist.get_world_size(group)): + if i == rank: + tmp = dist.reduce(tensor.contiguous(), ranks[i], op, group, async_op=True) + else: + tmp = dist.reduce(tensor_list[i].contiguous(), ranks[i], op, group, async_op=True) + ops.append(tmp) + + oplist = AsyncOpList(ops) + if async_op: + return oplist + else: + oplist.wait() diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/functional.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..af156d8d7d7f35fda8d60e7ae7e234ddf8657688 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/functional.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import infinity.models.videovae.utils.diffdist.modules as mods +import torch.distributed as dist + + +def consume_variable(tensor_to_consume, tensors_to_return, set_ones_grad=True): + return mods.ConsumeVariable(set_ones_grad)(tensor_to_consume, + *tensors_to_return) + + +def send(tensor, dst, group=dist.group.WORLD, tag=0): + return mods.Send(dst, group, tag)(tensor) + + +def recv(tensor, + src=None, + group=dist.group.WORLD, + tag=0, + next_backprop=None, + inplace=True): + return mods.Recv(src, group, tag, next_backprop, inplace)(tensor) + + +def broadcast(tensor, + src, + group=dist.group.WORLD, + next_backprop=None, + inplace=True): + return mods.Broadcast(src, group, next_backprop, inplace)(tensor) + + +def gather(tensor, + gather_list=None, + dst=None, + group=dist.group.WORLD, + next_backprop=None, + inplace=True): + return mods.Gather(dst, group, next_backprop, inplace)(tensor, gather_list) + + +def scatter(tensor, + scatter_list=None, + src=None, + group=dist.group.WORLD, + next_backprop=None, + inplace=True): + return mods.Scatter(src, group, next_backprop, inplace)(tensor, + scatter_list) + + +def all_gather(gather_list, + tensor, + 
group=dist.group.WORLD, + next_backprop=None, + inplace=True): + return mods.AllGather(group, next_backprop, inplace)(gather_list, tensor) diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/functions.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/functions.py new file mode 100644 index 0000000000000000000000000000000000000000..a11ea2eb1af26368edab9518e48139889b237854 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/functions.py @@ -0,0 +1,198 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from torch.autograd import Function +import infinity.models.videovae.utils.diffdist.extra_collectives as dist_extra +import torch.distributed as dist +import torch + + +class ConsumeVariableFunc(Function): + @staticmethod + def forward(ctx, tensor_to_consume, set_ones_grad, *tensors_to_return): + ctx.save_for_backward(tensor_to_consume) + ctx.set_ones_grad = set_ones_grad + return tensors_to_return + + @staticmethod + def backward(ctx, *grad_outputs): + tensor_to_consume, = ctx.saved_tensors + if ctx.set_ones_grad: + fake_grad = torch.ones_like(tensor_to_consume) + else: + fake_grad = torch.zeros_like(tensor_to_consume) + + return (fake_grad, None) + grad_outputs + + +class SendFunc(Function): + @staticmethod + def forward(ctx, tensor, dst, group=dist.group.WORLD, tag=0): + ctx.save_for_backward(tensor) + ctx.dst = dst + ctx.group = group + ctx.tag = tag + dist.send(tensor, dst, group, tag) + return tensor.new_tensor([]) + + @staticmethod + def backward(ctx, grad_output): + tensor, = ctx.saved_tensors + # TODO: Add ctx.needs_input_grad check + grad_tensor = torch.zeros_like(tensor) + dist.recv(grad_tensor, ctx.dst, ctx.group, ctx.tag) + + return grad_tensor, None, None, None + + +class RecvFunc(Function): + @staticmethod + def forward(ctx, + tensor, + src=None, + group=dist.group.WORLD, + tag=0, + inplace=True): + if not inplace: + tensor = torch.zeros_like(tensor).requires_grad_(False) + ctx.src = src + ctx.group = group + ctx.tag = tag + sender = dist.recv(tensor, src, group, tag) + if src: + assert sender == src + else: + ctx.src = sender + sender = torch.tensor(sender) + ctx.mark_non_differentiable(sender) + return tensor, sender + + @staticmethod + def backward(ctx, grad_tensor, grad_sender): + dist.send(grad_tensor, ctx.src, ctx.group, ctx.tag) + return grad_tensor, None, None, None, None + + +class BroadcastFunc(Function): + @staticmethod + def forward(ctx, tensor, src, group=dist.group.WORLD, inplace=True): + ctx.src = src + ctx.group = group + if dist.get_rank(group) == src: + if not inplace: + with torch.no_grad(): + tensor = tensor.clone().requires_grad_(False) + else: + if not inplace: + tensor = torch.zeros_like(tensor).requires_grad_(False) + dist.broadcast(tensor, src, group) + return tensor + + @staticmethod + def backward(ctx, grad_output): + dist.reduce(grad_output, + ctx.src, + op=dist.ReduceOp.SUM, + group=ctx.group) + return grad_output, None, None, None + + +class AllReduceFunc(Function): + @staticmethod + def forward(ctx, i): + raise NotImplementedError + + @staticmethod + def backward(ctx, grad_output): + raise NotImplementedError + + +class ReduceFunc(Function): + @staticmethod + def forward(ctx, i): + raise NotImplementedError + + @staticmethod + def backward(ctx, grad_output): + raise NotImplementedError + + +class AllGatherFunc(Function): + @staticmethod + def forward(ctx, tensor, group, inplace, *gather_list): + ctx.save_for_backward(tensor) + ctx.group = group + 
gather_list = list(gather_list) + if not inplace: + gather_list = [torch.zeros_like(g) for g in gather_list] + dist.all_gather(gather_list, tensor, group) + return tuple(gather_list) + + @staticmethod + def backward(ctx, *grads): + input, = ctx.saved_tensors + grad_out = torch.zeros_like(input) + dist_extra.reduce_scatter(grad_out, list(grads), group=ctx.group) + return (grad_out, None, None) + grads + + +class GatherFunc(Function): + @staticmethod + def forward(ctx, input, dst, group, inplace, *gather_list): + ctx.dst = dst + ctx.group = group + ctx.save_for_backward(input) + if dist.get_rank(group) == dst: + gather_list = list(gather_list) + if not inplace: + gather_list = [torch.zeros_like(g) for g in gather_list] + dist.gather(input, gather_list=gather_list, dst=dst, group=group) + return tuple(gather_list) + else: + dist.gather(input, [], dst=dst, group=group) + return input.new_tensor([]) + + @staticmethod + def backward(ctx, *grads): + input, = ctx.saved_tensors + grad_input = torch.zeros_like(input) + if dist.get_rank(ctx.group) == ctx.dst: + grad_outputs = list(grads) + dist.scatter(grad_input, + grad_outputs, + src=ctx.dst, + group=ctx.group) + return (grad_input, None, None, None) + grads + else: + dist.scatter(grad_input, [], src=ctx.dst, group=ctx.group) + return grad_input, None, None, None, None + + +class ScatterFunc(Function): + @staticmethod + def forward(ctx, + tensor, + src, + group=dist.group.WORLD, + inplace=True, + *scatter_list): + ctx.src = src + ctx.group = group + if not inplace: + tensor = torch.zeros_like(tensor) + if dist.get_rank(group) == src: + ctx.save_for_backward(*scatter_list) + scatter_list = list(scatter_list) + dist.scatter(tensor, scatter_list, src=src, group=group) + else: + dist.scatter(tensor, [], src=src, group=group) + return tensor + + @staticmethod + def backward(ctx, grad_tensor): + if dist.get_rank(ctx.group) == ctx.src: + grad_outputs = [torch.zeros_like(g) for g in ctx.saved_tensors] + dist.gather(grad_tensor, grad_outputs, ctx.src, group=ctx.group) + return (grad_tensor, None, None, None) + tuple(grad_outputs) + else: + dist.gather(grad_tensor, [], ctx.src, group=ctx.group) + return grad_tensor, None, None, None, None diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/modules.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..760d637d9f0e7f94a3a47de0f4f459f9591a331f --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/modules.py @@ -0,0 +1,157 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch.nn as nn +import torch.distributed as dist +import infinity.models.videovae.utils.diffdist.functions as funcs + + +class ConsumeVariable(nn.Module): + def __init__(self, set_ones_grad=True): + """ + If set_ones_grad=True then the gradient w.r.t tensor_to_consume + is set to 1 during backprop. Otherwise, it is set to 0. 
+ """ + super(ConsumeVariable, self).__init__() + self.set_ones_grad = set_ones_grad + + def forward(self, tensor_to_consume, *tensors_to_return): + tensors_to_return = funcs.ConsumeVariableFunc.apply( + tensor_to_consume, self.set_ones_grad, *tensors_to_return) + return tensors_to_return + + +class Send(nn.Module): + def __init__(self, dst, group=dist.group.WORLD, tag=0): + super(Send, self).__init__() + self.dst = dst + self.group = group + self.tag = tag + + def forward(self, tensor): + return funcs.SendFunc.apply(tensor, self.dst, self.group, self.tag) + + +class Recv(nn.Module): + def __init__(self, + src=None, + group=dist.group.WORLD, + tag=0, + next_backprop=None, + inplace=True): + super(Recv, self).__init__() + self.next_backprop = next_backprop + self.src = src + self.group = group + self.tag = tag + self.inplace = inplace + + self.consume = None + if self.next_backprop is not None: + self.consume = ConsumeVariable() + + def forward(self, tensor): + if self.consume: + tensor, = self.consume(self.next_backprop, tensor) + tensor, sender = funcs.RecvFunc.apply(tensor, self.src, self.group, + self.tag, self.inplace) + return tensor, sender.item() + + +class Broadcast(nn.Module): + def __init__(self, + src, + group=dist.group.WORLD, + next_backprop=None, + inplace=True): + super(Broadcast, self).__init__() + self.src = src + self.group = group + self.next_backprop = next_backprop + self.inplace = inplace + + self.consume = None + if self.next_backprop is not None: + self.consume = ConsumeVariable() + + def forward(self, tensor): + if self.consume: + tensor, = self.consume(self.next_backprop, tensor) + return funcs.BroadcastFunc.apply(tensor, self.src, self.group, + self.inplace) + + +class Gather(nn.Module): + def __init__(self, + dst=None, + group=dist.group.WORLD, + next_backprop=None, + inplace=True): + super(Gather, self).__init__() + self.dst = dst + self.group = group + self.next_backprop = next_backprop + self.inplace = inplace + + self.consume = None + if self.next_backprop is not None: + self.consume = ConsumeVariable() + + def forward(self, tensor, gather_list=None): + if self.consume: + tensor, = self.consume(self.next_backprop, tensor) + if dist.get_rank(self.group) == self.dst: + return list( + funcs.GatherFunc.apply(tensor, self.dst, self.group, + self.inplace, *gather_list)) + else: + return funcs.GatherFunc.apply(tensor, self.dst, self.group, + self.inplace, None) + + +class Scatter(nn.Module): + def __init__(self, + src=None, + group=dist.group.WORLD, + next_backprop=None, + inplace=True): + super(Scatter, self).__init__() + self.src = src + self.group = group + self.next_backprop = next_backprop + self.inplace = inplace + + self.consume = None + if self.next_backprop is not None: + self.consume = ConsumeVariable() + + def forward(self, tensor, scatter_list=None): + if self.consume: + tensor, = self.consume(self.next_backprop, tensor) + if dist.get_rank(self.group) == self.src: + return funcs.ScatterFunc.apply(tensor, self.src, self.group, + self.inplace, *scatter_list) + else: + return funcs.ScatterFunc.apply(tensor, self.src, self.group, + self.inplace, None) + + +class AllGather(nn.Module): + def __init__(self, + group=dist.group.WORLD, + next_backprop=None, + inplace=True): + super(AllGather, self).__init__() + self.group = group + self.next_backprop = next_backprop + self.inplace = inplace + + self.consume = None + if self.next_backprop is not None: + self.consume = ConsumeVariable() + + def forward(self, gather_list, tensor): + if self.consume: + tensor, = 
self.consume(self.next_backprop, tensor) + return list( + funcs.AllGatherFunc.apply(tensor, self.group, self.inplace, + *gather_list)) diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/testing.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8c271695ed93c1c954a83056381bc94f20f151 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/diffdist/testing.py @@ -0,0 +1,275 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import infinity.models.videovae.utils.diffdist.functional as distops +import torch.distributed as dist +import torch +import infinity.models.videovae.utils.diffdist.extra_collectives as extra_comm + + +def test_reduce_scatter(): + if dist.get_rank() == 0: + print("REDUCE_SCATTER TEST\n") + x = torch.arange(dist.get_world_size()).float().split(1) + buff = torch.tensor(0.) + extra_comm.reduce_scatter(buff, x) + print(dist.get_rank(), x) + print(dist.get_rank(), buff) + dist.barrier() + if dist.get_rank() == 0: + print('-' * 50) + + +def test_all_gather(): + if dist.get_rank() == 0: + print("ALL GATHER TEST\n") + dist.barrier() + x = torch.tensor(3., requires_grad=True) + y = (dist.get_rank() + 1) * x + + print(dist.get_rank(), "Sending y:", y) + z = distops.all_gather(list(torch.zeros(dist.get_world_size())), + y, + next_backprop=None, + inplace=True) + print(dist.get_rank(), "Received tensor:", z) + l = torch.sum(torch.stack(z)) + l = l * (dist.get_rank() + 1) + l.backward() + + print(dist.get_rank(), "Gradient with MPI:", x.grad) + dist.barrier() + if dist.get_rank() == 0: + print() + x = [ + torch.tensor(3., requires_grad=True) + for i in range(dist.get_world_size()) + ] + res = [] + for i in range(1, dist.get_world_size() + 1): + res.append(i * x[i - 1]) + + res2 = [] + for i in range(dist.get_world_size()): + temp = [] + for j in range(dist.get_world_size()): + temp.append(torch.clone(res[j])) + res2.append(temp) + l_s = [torch.sum(torch.stack(i)) for i in res2] + final = [(i + 1) * k for i, k in enumerate(l_s)] + for i in range(dist.get_world_size() - 1): + final[i].backward(retain_graph=True) + final[-1].backward() + for i, x_i in enumerate(x): + print(i, "Gradient in single process:", x_i.grad) + print('-' * 50) + + +def test_scatter(): + if dist.get_rank() == 0: + print("SCATTER TEST\n") + x = [ + torch.tensor(3., requires_grad=True) + for i in range(dist.get_world_size()) + ] + y = [2 * x_i for x_i in x] + + print("Sending y:", y) + buffer = torch.tensor(0.) 
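+ # Rank 0 (src) passes the full scatter_list and receives its own chunk into + # 'buffer'; the other ranks (else-branch below) call scatter() without a list.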
+ z = distops.scatter(buffer, y, src=0, inplace=False) + else: + buffer = torch.tensor(0., requires_grad=True) + z = distops.scatter(buffer, src=0, inplace=False) + + print(dist.get_rank(), "Received tensor:", z) + # Computation + k = (dist.get_rank() + 1) * z + k.backward() + + if dist.get_rank() == 0: + print("Gradient with MPI:", [x_i.grad for x_i in x]) + + if dist.get_rank() == 0: + print() + x = [ + torch.tensor(3., requires_grad=True) + for i in range(dist.get_world_size()) + ] + y = [2 * x_i for x_i in x] + res = [] + for i in range(dist.get_world_size()): + res.append((i + 1) * y[i]) + + for i, k in enumerate(res): + k.backward() + print("Gradient in single process:", [x_i.grad for x_i in x]) + dist.barrier() + if dist.get_rank() == 0: + print('-' * 50) + + +def test_gather(): + if dist.get_rank() == 0: + print("GATHER TEST\n") + dist.barrier() + x = torch.tensor(3., requires_grad=True) + y = (dist.get_rank() + 1) * x + + print(dist.get_rank(), "Sending y:", y) + if dist.get_rank() == 0: + z = distops.gather(y, + torch.zeros(dist.get_world_size()).split(1), + dst=0, + next_backprop=None, + inplace=True) + print(dist.get_rank(), "Received tensor:", z) + l = torch.sum(torch.stack(z)) + l.backward() + else: + dummy = distops.gather(y, dst=0, next_backprop=None, inplace=True) + dummy.backward(torch.tensor([])) + print(dist.get_rank(), "Gradient with MPI:", x.grad) + dist.barrier() + if dist.get_rank() == 0: + print() + x = [ + torch.tensor(3., requires_grad=True) + for i in range(dist.get_world_size()) + ] + res = [] + for i in range(1, dist.get_world_size() + 1): + res.append(i * x[i - 1]) + + z = torch.stack(res) + l = torch.sum(z) + l.backward() + for i, x_i in enumerate(x): + print(i, "Gradient in single process:", x_i.grad) + print('-' * 50) + + +def test_broadcast(): + if dist.get_rank() == 0: + print("BROADCAST TEST\n") + x = torch.tensor(3., requires_grad=True) + y = 2 * x + + print(dist.get_rank(), "Sending y:", y) + z = distops.broadcast(y, src=0, inplace=False) + print(dist.get_rank(), "Received tensor:", z) + + # Computation + k = 3 * z + k.backward() + print("Gradient with MPI:", x.grad) + + print() + x = torch.tensor(3., requires_grad=True) + y = 2 * x + res = [3 * y] + for i in range(1, dist.get_world_size()): + res.append(9 * y) + + for i, k in enumerate(res): + if i == (len(res) - 1): + k.backward() + else: + k.backward(retain_graph=True) + print("Gradient in single process:", x.grad) + else: + x = torch.tensor(5., requires_grad=True) + y = 7 * x + + buffer = torch.tensor(0.) + z = distops.broadcast(buffer, src=0, next_backprop=y) + print(dist.get_rank(), "Received tensor:", z) + k = 9 * z + k.backward() + print(dist.get_rank(), "Grad of disconnected part:", x.grad) + dist.barrier() + if dist.get_rank() == 0: + print('-' * 50) + + +def test_consume_variable(): + x = torch.tensor(5., requires_grad=True) + y = 2 * x + + z = 3 * y + j = 4 * y + + z = distops.consume_variable(j, [z], set_ones_grad=True)[0] + print(z) + z.backward() + print(x.grad) + print() + x = torch.tensor(5., requires_grad=True) + y = 2 * x + + z = 3 * y + j = 4 * y + + z.backward(retain_graph=True) + j.backward() + print(x.grad) + + +def test_send_recv(): + if dist.get_rank() == 0: + print("SEND/RECV TEST\n") + x = torch.tensor(3., requires_grad=True) + y = 2 * x + + print("Before sending y:", y) + connector = distops.send(y, dst=1) + # Computation happens in process 1 + buffer = torch.tensor(0.) 
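+ # 'connector' (the output of distops.send above) is passed as next_backprop so + # that backprop through the received tensor also reaches the send op, letting + # the gradient make the round trip through the computation done on rank 1.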
+ z, _ = distops.recv(buffer, src=1, next_backprop=connector) + print("After receiving:", z) + + k = 3 * z + k.backward() + print("Gradient with MPI:", x.grad) + + print() + x = torch.tensor(3., requires_grad=True) + y = 2 * x + l = y * 10 + k = 3 * l + k.backward() + print("Gradient in single process:", x.grad) + print('-' * 50) + elif dist.get_rank() == 1: + buffer = torch.tensor(0., requires_grad=True) + y, _ = distops.recv(buffer, src=0) + + l = y * 10 + + connector = distops.send(l, dst=0) + connector.backward(torch.tensor([])) + + +if __name__ == '__main__': + dist.init_process_group('mpi') + + print(f'I am {dist.get_rank()}') + dist.barrier() + if dist.get_rank() == 0: + print('-' * 50) + + if dist.get_rank() == 0: + print("EXTRA COLLECTIVES") + + test_reduce_scatter() + + if dist.get_rank() == 0: + print('-' * 50) + + test_send_recv() + + test_broadcast() + + test_gather() + + test_scatter() + + test_all_gather() diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/distributed.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..e5d6e4cc0280fc07fce37796949bba9d0cc832c0 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/distributed.py @@ -0,0 +1,170 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +# from https://github.com/FoundationVision/LlamaGen/blob/main/utils/distributed.py +import os +import sys +import glob +import torch +import subprocess +import torch.distributed as dist +import datetime +import logging + +from infinity.models.videovae.utils.misc import rank_zero_only, COLOR_BLUE, COLOR_RESET + +from torch.distributed.fsdp.wrap import ModuleWrapPolicy +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, + ShardingStrategy, + MixedPrecision, +) +from infinity.models.videovae.models.cvivit_vqgan import CViViT_Decoder, CViViT_Encoder + + +def setup_for_distributed(is_master, logging_dir=""): + """ + This function disables printing when not in master process and + redirects stdout to log_out.txt and stderr to log_err.txt. 
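+ + On non-master ranks, print() becomes a no-op unless called with force=True; + on the master rank, stdout/stderr are additionally mirrored to numbered + log_out_*.txt / log_err_*.txt files under logging_dir.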
+ """ + import builtins as __builtin__ + + class Logger(logging.StreamHandler): + def __init__(self, stream, file): + super().__init__(stream) + self.file = file + + def emit(self, record): + try: + msg = self.format(record) + stream = self.stream + fs = "%s\n" + + # Stream to the original stream and then flush + stream.write(fs % msg) + stream.flush() + + # Stream to the file and then flush + self.file.write(fs % msg) + self.file.flush() + except Exception as e: + self.handleError(record) + + def isatty(self): + # Mimic the isatty method usually found in file-like objects + return self.stream.isatty() + + # print rank 0 only + builtin_print = __builtin__.print + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + __builtin__.print = print + + if is_master: + os.makedirs(logging_dir, exist_ok=True) + existing_logs = glob.glob(os.path.join(logging_dir, 'log_out_*.txt')) + log_numbers = [int(log.split('.txt')[0].split('_')[-1]) for log in existing_logs] + next_log_number = max(log_numbers) + 1 if log_numbers else 1 + + log_out_path = os.path.join(logging_dir, f'log_out_{next_log_number}.txt') + log_err_path = os.path.join(logging_dir, f'log_err_{next_log_number}.txt') + + logger_stdout = Logger(sys.stdout, open(log_out_path, 'w')) + logger_stderr = Logger(sys.stderr, open(log_err_path, 'w')) + logging.basicConfig(level=logging.DEBUG, handlers=[logger_stdout, logger_stderr]) + + print(f"{COLOR_BLUE}stdout will be written to {log_out_path}{COLOR_RESET}") + print(f"{COLOR_BLUE}stderr will be written to {log_err_path}{COLOR_RESET}") + +def init_distributed_mode(args, timeout_minutes=15): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + args.dist_url = 'env://' + os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count()) + elif 'SLURM_PROCID' in os.environ: + proc_id = int(os.environ['SLURM_PROCID']) + ntasks = int(os.environ['SLURM_NTASKS']) + node_list = os.environ['SLURM_NODELIST'] + num_gpus = torch.cuda.device_count() + addr = subprocess.getoutput( + 'scontrol show hostname {} | head -n1'.format(node_list)) + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', '29500') + os.environ['MASTER_ADDR'] = addr + os.environ['WORLD_SIZE'] = str(ntasks) + os.environ['RANK'] = str(proc_id) + os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) + os.environ['LOCAL_SIZE'] = str(num_gpus) + args.dist_url = 'env://' + args.world_size = ntasks + args.rank = proc_id + args.gpu = proc_id % num_gpus + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank, + timeout=datetime.timedelta(seconds=timeout_minutes * 60) + ) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0, args.default_root_dir) + +def _FSDP(model: torch.nn.Module, device, zero) -> FSDP: + model = FSDP( + model, + auto_wrap_policy=ModuleWrapPolicy([CViViT_Encoder, CViViT_Decoder]), + device_id=device, + sharding_strategy={1:ShardingStrategy.HYBRID_SHARD, 2:ShardingStrategy.SHARD_GRAD_OP, 3:ShardingStrategy.FULL_SHARD}.get(zero), + mixed_precision=MixedPrecision( + 
param_dtype=torch.float, + reduce_dtype=torch.float, + buffer_dtype=torch.float, + ), + sync_module_states=True, + limit_all_gathers=True, + use_orig_params=True, + ) + torch.cuda.synchronize() + return model + + +def reduce_losses(loss_dict, dst=0): + loss_names = list(loss_dict.keys()) + loss_tensor = torch.stack([loss_dict[name] for name in loss_names]) + + dist.reduce(loss_tensor, dst=dst, op=dist.ReduceOp.SUM) + # Only average the loss values on the destination rank + if dist.get_rank() == dst: + loss_tensor /= dist.get_world_size() + averaged_losses = {name: loss_tensor[i].item() for i, name in enumerate(loss_names)} + else: + averaged_losses = {name: None for name in loss_names} + + return averaged_losses + +@rank_zero_only +def average_losses(loss_dict_list): + sum_dict = {} + count_dict = {} + for loss_dict in loss_dict_list: + for key, value in loss_dict.items(): + if key in sum_dict: + sum_dict[key] += value + count_dict[key] += 1 + else: + sum_dict[key] = value + count_dict[key] = 1 + + avg_dict = {key: sum_dict[key] / count_dict[key] for key in sum_dict} + return avg_dict diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/dynamic_resolution.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/dynamic_resolution.py new file mode 100644 index 0000000000000000000000000000000000000000..9745a2cbdb6e93275621da77591c8d5a54946426 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/dynamic_resolution.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import json +import numpy as np +import tqdm + +vae_stride = 16 +ratio2hws = { + 1.000: [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16),(20,20),(24,24),(32,32),(40,40),(48,48),(64,64),(80,80),(96,96),(128,128)], + 1.250: [(1,1),(2,2),(3,3),(5,4),(10,8),(15,12),(20,16),(25,20),(30,24),(35,28),(45,36),(55,44),(70,56),(90,72),(110,88),(140,112)], + 1.333: [(1,1),(2,2),(4,3),(8,6),(12,9),(16,12),(20,15),(24,18),(28,21),(36,27),(48,36),(60,45),(72,54),(96,72),(120,90),(144,108)], + 1.500: [(1,1),(2,2),(3,2),(6,4),(9,6),(15,10),(21,14),(27,18),(33,22),(39,26),(48,32),(63,42),(78,52),(96,64),(126,84),(156,104)], + 1.750: [(1,1),(2,2),(3,3),(7,4),(11,6),(14,8),(21,12),(28,16),(35,20),(42,24),(56,32),(70,40),(84,48),(112,64),(140,80),(168,96)], + 2.000: [(1,1),(2,2),(4,2),(6,3),(10,5),(16,8),(22,11),(30,15),(38,19),(46,23),(60,30),(74,37),(90,45),(120,60),(148,74),(180,90)], + 2.500: [(1,1),(2,2),(5,2),(10,4),(15,6),(20,8),(25,10),(30,12),(40,16),(50,20),(65,26),(80,32),(100,40),(130,52),(160,64),(200,80)], + 3.000: [(1,1),(2,2),(6,2),(9,3),(15,5),(21,7),(27,9),(36,12),(45,15),(54,18),(72,24),(90,30),(111,37),(144,48),(180,60),(222,74)], +} +full_ratio2hws = {} +for ratio, hws in ratio2hws.items(): + full_ratio2hws[ratio] = hws + full_ratio2hws[int(1/ratio*1000)/1000] = [(item[1], item[0]) for item in hws] + +dynamic_resolution_h_w = {} +predefined_HW_Scales_dynamic = {} +aspect_ratio_scale_list = [] +bs_dict = {7: 8, 10: 4, 13: 1, 16: 1} # 256x256: batch=8, 512x512: batch=4, 1024x1024: batch=1 (bs=1 avoid OOM) +for ratio in full_ratio2hws: + dynamic_resolution_h_w[ratio] ={} + for ind, leng in enumerate([7, 10, 13, 16]): + h, w = full_ratio2hws[ratio][leng-1][0], full_ratio2hws[ratio][leng-1][1] # feature map size + pixel = (h * vae_stride, w * vae_stride) # The original image (H, W) + dynamic_resolution_h_w[ratio][pixel[1]] = { + 'pixel': pixel, + 'scales': full_ratio2hws[ratio][:leng] + } # W as key + predefined_HW_Scales_dynamic[(h, w)] = 
full_ratio2hws[ratio][:leng] + # deal with aspect_ratio_scale_list + info_dict = {"ratio": ratio, "h": h * vae_stride, "w": w * vae_stride, "bs": bs_dict[leng]} + aspect_ratio_scale_list.append(info_dict) diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/dynamic_resolution_two_pyramid.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/dynamic_resolution_two_pyramid.py new file mode 100644 index 0000000000000000000000000000000000000000..0d1e293e67c7f617a0b3920517abff54062617a6 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/dynamic_resolution_two_pyramid.py @@ -0,0 +1,103 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import math + +import numpy as np + +video_frames = 97 +vae_stride = 16 +compressed_frames = video_frames // 4 + 1 + +def append_dummy_t(ratio2hws): + for key in ratio2hws: + for i in range(len(ratio2hws[key])): + h, w = ratio2hws[key][i] + ratio2hws[key][i] = (1, h, w) + return ratio2hws + +def get_full_ratio2hws(ratio2hws, total_pixels2scales): + full_ratio2hws = {} + for ratio, hws in ratio2hws.items(): + real_ratio = hws[-1][1] / hws[-1][2] + full_ratio2hws[int(real_ratio*1000)/1000] = hws + if ratio != 1.000: + full_ratio2hws[int(1/real_ratio*1000)/1000] = [(item[0], item[2], item[1]) for item in hws] + + dynamic_resolution_h_w = {} + for ratio in full_ratio2hws: + dynamic_resolution_h_w[ratio] = {} + for _, scales_num in total_pixels2scales.items(): + h, w = full_ratio2hws[ratio][scales_num-1][1], full_ratio2hws[ratio][scales_num-1][2] + # pixel = (h * vae_stride, w * vae_stride) + scales = full_ratio2hws[ratio][:scales_num] + dynamic_resolution_h_w[ratio][(h, w)] = scales + return dynamic_resolution_h_w + +# ratio2hws = { +# 1.000: [(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(10,10),(12,12),(16,16),(24,24),(32,32),(48,48),(60,60),(64,64)], +# 1.250: [(1,1),(2,2),(3,3),(4,3),(5,4),(6,5),(7,5),(8,6),(10,8),(15,12),(20,16),(30,24),(35,28),(45,36),(66,52),(70,56)], +# 1.333: [(1,1),(2,2),(3,2),(4,3),(5,4),(6,5),(7,5),(8,6),(12,9),(16,12),(20,15),(28,21),(36,27),(48,36),(68,50),(72,54)], +# 1.500: [(1,1),(2,2),(3,2),(4,3),(5,3),(6,4),(7,4),(8,6),(12,8),(15,10),(21,14),(33,22),(39,26),(48,32),(72,48),(78,52)], +# 1.750: [(1,1),(2,2),(3,3),(4,3),(5,3),(6,4),(7,4),(8,5),(12,7),(14,8),(21,12),(32,18),(42,24),(54,30),(80,45),(84,48)], +# 2.000: [(1,1),(2,2),(3,2),(4,2),(5,3),(6,3),(7,4),(8,4),(12,6),(16,8),(22,11),(38,19),(46,23),(60,30),(82,41),(90,45)], +# 2.500: [(1,1),(2,2),(3,2),(4,2),(5,2),(7,3),(8,3),(10,4),(15,6),(20,8),(25,10),(40,16),(50,20),(65,26),(90,36),(100,40)], +# 3.000: [(1,1),(2,2),(3,2),(4,2),(5,2),(6,2),(8,3),(9,3),(15,5),(21,7),(27,9),(45,15),(54,18),(72,24),(96,32),(111,37)], +# } +# total_pixels2scales = { +# '0.06M': 11, +# '0.15M': 13, +# '0.25M': 13, +# '0.40M': 14, +# '0.90M': 15, +# '1M': 16, +# } + +def get_ratio2hws_video_v2(): + ratio2hws_video_common_v2 = {} + for h_div_w in [1, 100/116, 3/4, 2/3, 9/16, 1/2, 2/5, 1/3]: + scale_schedule = [] + # 48*48 is 480p, 60*60 is 720p + # for scale in list(range(1,1+16)) + [20, 24, 30, 40]: + for scale in [1,2,3,4,5,6,7,8,10,12,16] + [24, 32, 40, 48, 60]: + area = scale * scale + pw_float = math.sqrt(area / h_div_w) + ph_float = pw_float * h_div_w + ph, pw = int(np.round(ph_float)), int(np.round(pw_float)) + scale_schedule.append((ph, pw)) + ratio2hws_video_common_v2[h_div_w] = scale_schedule + total_pixels2scales = { + '0.06M': 11, + '0.15M': 13, + '0.40M': 14, + '0.60M': 15, + '0.90M': 16, + } + return 
ratio2hws_video_common_v2, total_pixels2scales + +ratio2hws, total_pixels2scales = get_ratio2hws_video_v2() +ratio2hws = append_dummy_t(ratio2hws) +dynamic_resolution_h_w = get_full_ratio2hws(ratio2hws, total_pixels2scales) +dynamic_resolution_thw = {} +for ratio in dynamic_resolution_h_w: + for (h, w) in dynamic_resolution_h_w[ratio]: + image_scale_schedule = dynamic_resolution_h_w[ratio][(h, w)] + spatial_time_schedule = [] + spatial_time_schedule.extend(image_scale_schedule) + firstframe_scalecnt = len(image_scale_schedule) + # if compressed_frames > 1: + # scale_schedule = dynamic_resolution_h_w[ratio][pn]['scales'] + # # predefined_t = np.linspace(1, compressed_frames - 1, len(scale_schedule)) + # predefined_t = np.linspace(1, compressed_frames - 1, total_pixels2scales['0.06M']-1).tolist() + [compressed_frames - 1] * (len(scale_schedule)-total_pixels2scales['0.06M']+1) + # spatial_time_schedule.extend([(min(int(np.round(predefined_t[i])), compressed_frames - 1), h, w) for i, (_, h, w) in enumerate(scale_schedule)]) + dynamic_resolution_thw[(h, w)] = {} + dynamic_resolution_thw[(h, w)]['scales'] = spatial_time_schedule + dynamic_resolution_thw[(h, w)]['tower_split_index'] = firstframe_scalecnt + +# print(dynamic_resolution_thw) + +if __name__ == '__main__': + ratio2hws_video_common_v2, total_pixels2scales = get_ratio2hws_video_v2() + for h_div_w in ratio2hws_video_common_v2: + print(h_div_w, ratio2hws_video_common_v2[h_div_w][10]) + \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/ema.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..442265ab6a499dacbb78e01163e126dd10e576d4 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/ema.py @@ -0,0 +1,24 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch +from collections import OrderedDict + +@torch.no_grad() +def update_ema(ema_model, model, decay=0.9999): + """ + Step the EMA model towards the current model. + """ + ema_params = OrderedDict(ema_model.named_parameters()) + model_params = OrderedDict(model.named_parameters()) + + for name, param in model_params.items(): + # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed + ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay) + + +def requires_grad(model, flag=True): + """ + Set requires_grad flag for all parameters in a model. 
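+ + Typically called as requires_grad(ema_model, False) right after creating the + EMA copy, so it is only ever updated through update_ema() above.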
+ """ + for p in model.parameters(): + p.requires_grad = flag \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/fs.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/fs.py new file mode 100644 index 0000000000000000000000000000000000000000..88a29fe832820942f76ee532b050405f89b89b78 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/fs.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from typing import Tuple +import tempfile +import os +import fsspec + +TMP_DIR = None + + +def get_fsspec(path: str): + def get_protocol(path: str) -> Tuple[fsspec.spec.AbstractFileSystem, str]: + return fsspec.core.url_to_fs(path) + + if isinstance(path, str): + return get_protocol(path) + + # unkown path type default to local + return fsspec.filesystem("local"), path + + +def get_temp_dir(): + global TMP_DIR + if TMP_DIR: + return TMP_DIR + TMP_DIR = tempfile.TemporaryDirectory() + return TMP_DIR + + +def retrieve_local_path(path: str, worker_id): + local_path = os.path.join("/dev/shm/", get_temp_dir().name.lstrip("/"), str(worker_id), path.lstrip("/")) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + return local_path diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/init_models.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/init_models.py new file mode 100644 index 0000000000000000000000000000000000000000..1e267508f5b4517e5d27691da0b3e93350f7b60d --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/init_models.py @@ -0,0 +1,388 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from infinity.models.videovae.utils.misc import is_torch_optim_sch + + +def inflate_gen(state_dict, temporal_patch_size, spatial_patch_size, strategy="average", inflation_pe=False): + new_state_dict = state_dict.copy() + + pe_image0_w = state_dict["encoder.to_patch_emb_first_frame.1.weight"] # image_channel * patch_width * patch_height + pe_image0_b = state_dict["encoder.to_patch_emb_first_frame.1.bias"] # image_channel * patch_width * patch_height + pe_image1_w = state_dict["encoder.to_patch_emb_first_frame.2.weight"] # image_channel * patch_width * patch_height, dim + pe_image1_b = state_dict["encoder.to_patch_emb_first_frame.2.bias"] # image_channel * patch_width * patch_height + pe_image2_w = state_dict["encoder.to_patch_emb_first_frame.3.weight"] # image_channel * patch_width * patch_height + pe_image2_b = state_dict["encoder.to_patch_emb_first_frame.3.bias"] # image_channel * patch_width * patch_height + + pd_image0_w = state_dict["decoder.to_pixels_first_frame.0.weight"] # dim, image_channel * patch_width * patch_height + pd_image0_b = state_dict["decoder.to_pixels_first_frame.0.bias"] # image_channel * patch_width * patch_height + + pe_video0_w = state_dict["encoder.to_patch_emb.1.weight"] + + old_patch_size = int(math.sqrt(pe_image0_w.shape[0] // 3)) + old_patch_size_temporal = pe_video0_w.shape[0] // (3 * old_patch_size * old_patch_size) + + if old_patch_size != spatial_patch_size or old_patch_size_temporal != temporal_patch_size: + if not inflation_pe: + del new_state_dict["encoder.to_patch_emb_first_frame.1.weight"] + del new_state_dict["encoder.to_patch_emb_first_frame.1.bias"] + del new_state_dict["encoder.to_patch_emb_first_frame.2.weight"] + + del new_state_dict["decoder.to_pixels_first_frame.0.weight"] + del 
new_state_dict["decoder.to_pixels_first_frame.0.bias"] + + del new_state_dict["encoder.to_patch_emb.1.weight"] + del new_state_dict["encoder.to_patch_emb.1.bias"] + del new_state_dict["encoder.to_patch_emb.2.weight"] + + del new_state_dict["decoder.to_pixels.0.weight"] + del new_state_dict["decoder.to_pixels.0.bias"] + + return new_state_dict + + + print(f"Inflate the patch embedding size from {old_patch_size_temporal}x{old_patch_size}x{old_patch_size} to {temporal_patch_size}x{spatial_patch_size}x{spatial_patch_size}.") + pe_image0_w = F.interpolate(pe_image0_w.unsqueeze(0).unsqueeze(0), size=(3 * spatial_patch_size * spatial_patch_size)).squeeze(0).squeeze(0) + pe_image0_b = F.interpolate(pe_image0_b.unsqueeze(0).unsqueeze(0), size=(3 * spatial_patch_size * spatial_patch_size)).squeeze(0).squeeze(0) + pe_image1_w = F.interpolate(pe_image1_w.unsqueeze(0), size=(3 * spatial_patch_size * spatial_patch_size)).squeeze(0) + + new_state_dict["encoder.to_patch_emb_first_frame.1.weight"] = pe_image0_w + new_state_dict["encoder.to_patch_emb_first_frame.1.bias"] = pe_image0_b + new_state_dict["encoder.to_patch_emb_first_frame.2.weight"] = pe_image1_w + + pd_image0_w = F.interpolate(pd_image0_w.permute(1, 0).unsqueeze(0), size=(3 * spatial_patch_size * spatial_patch_size)).squeeze(0).permute(1, 0) + pd_image0_b = F.interpolate(pd_image0_b.unsqueeze(0).unsqueeze(0), size=(3 * spatial_patch_size * spatial_patch_size)).squeeze(0).squeeze(0) + + new_state_dict["decoder.to_pixels_first_frame.0.weight"] = pd_image0_w + new_state_dict["decoder.to_pixels_first_frame.0.bias"] = pd_image0_b + + pe_video0_w = state_dict["encoder.to_patch_emb.1.weight"] + pe_video0_b = state_dict["encoder.to_patch_emb.1.bias"] + pe_video1_w = state_dict["encoder.to_patch_emb.2.weight"] + + pe_video0_w = F.interpolate(pe_video0_w.unsqueeze(0).unsqueeze(0), size=(3 * temporal_patch_size * spatial_patch_size * spatial_patch_size)).squeeze(0).squeeze(0) + pe_video0_b = F.interpolate(pe_video0_b.unsqueeze(0).unsqueeze(0), size=(3 * temporal_patch_size* spatial_patch_size * spatial_patch_size)).squeeze(0).squeeze(0) + pe_video1_w = F.interpolate(pe_video1_w.unsqueeze(0), size=(3 * temporal_patch_size * spatial_patch_size * spatial_patch_size)).squeeze(0) + + pd_video0_w = state_dict["decoder.to_pixels.0.weight"] + pd_video0_b = state_dict["decoder.to_pixels.0.bias"] + + pd_video0_w = F.interpolate(pd_image0_w.permute(1, 0).unsqueeze(0), size=(3 * temporal_patch_size * spatial_patch_size * spatial_patch_size)).squeeze(0).permute(1, 0) + pd_video0_b = F.interpolate(pd_image0_b.unsqueeze(0).unsqueeze(0), size=(3 * temporal_patch_size * spatial_patch_size * spatial_patch_size)).squeeze(0).squeeze(0) + + new_state_dict["encoder.to_patch_emb.1.weight"] = pe_video0_w + new_state_dict["encoder.to_patch_emb.1.bias"] = pe_video0_b + new_state_dict["encoder.to_patch_emb.2.weight"] = pe_video1_w + + new_state_dict["decoder.to_pixels.0.weight"] = pd_video0_w + new_state_dict["decoder.to_pixels.0.bias"] = pd_video0_b + + return new_state_dict + + + if strategy == "average": + pe_video0_w = torch.cat([pe_image0_w/temporal_patch_size] * temporal_patch_size) + pe_video0_b = torch.cat([pe_image0_b/temporal_patch_size] * temporal_patch_size) + + pe_video1_w = torch.cat([pe_image1_w/temporal_patch_size] * temporal_patch_size, dim=-1) + pe_video1_b = pe_image1_b # torch.cat([pe_image1_b/temporal_patch_size] * temporal_patch_size) + + pe_video2_w = pe_image2_w # torch.cat([pe_image2_w/temporal_patch_size] * temporal_patch_size) + pe_video2_b = pe_image2_b 
# torch.cat([pe_image2_b/temporal_patch_size] * temporal_patch_size) + + elif strategy == "first": + pe_video0_w = torch.cat([pe_image0_w] + [torch.zeros_like(pe_image0_w, dtype=pe_image0_w.dtype)] * (temporal_patch_size - 1)) + pe_video0_b = torch.cat([pe_image0_b] + [torch.zeros_like(pe_image0_b, dtype=pe_image0_b.dtype)] * (temporal_patch_size - 1)) + + pe_video1_w = torch.cat([pe_image1_w] + [torch.zeros_like(pe_image1_w, dtype=pe_image1_w.dtype)] * (temporal_patch_size - 1), dim=-1) + pe_video1_b = pe_image1_b # torch.cat([pe_image1_b] + [torch.zeros_like(pe_image1_b, dtype=pe_image1_b.dtype)] * (temporal_patch_size - 1)) + + pe_video2_w = pe_image2_w # torch.cat([pe_image2_w] + [torch.zeros_like(pe_image2_w, dtype=pe_image2_w.dtype)] * (temporal_patch_size - 1)) + pe_video2_b = pe_image2_b # torch.cat([pe_image2_b] + [torch.zeros_like(pe_image2_b, dtype=pe_image2_b.dtype)] * (temporal_patch_size - 1)) + + + else: + raise NotImplementedError + + + new_state_dict["encoder.to_patch_emb.1.weight"] = pe_video0_w + new_state_dict["encoder.to_patch_emb.1.bias"] = pe_video0_b + + new_state_dict["encoder.to_patch_emb.2.weight"] = pe_video1_w + new_state_dict["encoder.to_patch_emb.2.bias"] = pe_video1_b + + new_state_dict["encoder.to_patch_emb.3.weight"] = pe_video2_w + new_state_dict["encoder.to_patch_emb.3.bias"] = pe_video2_b + + + if strategy == "average": + pd_video0_w = torch.cat([pd_image0_w/temporal_patch_size] * temporal_patch_size) + pd_video0_b = torch.cat([pd_image0_b/temporal_patch_size] * temporal_patch_size) + + elif strategy == "first": + pd_video0_w = torch.cat([pd_image0_w] + [torch.zeros_like(pd_image0_w, dtype=pd_image0_w.dtype)] * (temporal_patch_size - 1)) + pd_video0_b = torch.cat([pd_image0_b] + [torch.zeros_like(pd_image0_b, dtype=pd_image0_b.dtype)] * (temporal_patch_size - 1)) + + else: + raise NotImplementedError + + + new_state_dict["decoder.to_pixels.0.weight"] = pd_video0_w + new_state_dict["decoder.to_pixels.0.bias"] = pd_video0_b + + return new_state_dict + + +def inflate_dis(state_dict, strategy="center"): + print("#" * 50) + print(f"Initialize the video discriminator with {strategy}.") + print("#" * 50) + idis_weights = {k: v for k, v in state_dict.items() if "image_discriminator" in k} + vids_weights = {k: v for k, v in state_dict.items() if "video_discriminator" in k} + + new_state_dict = state_dict.copy() + for k in vids_weights.keys(): + del new_state_dict[k] + + + for k in idis_weights.keys(): + new_k = "video_discriminator" + k[len("image_discriminator"):] + if "weight" in k and new_state_dict[k].ndim == 4: + old_weight = state_dict[k] + if strategy == "average": + new_weight = old_weight.unsqueeze(2).repeat(1, 1, 4, 1, 1) / 4 + elif strategy == "center": + new_weight_ = old_weight# .unsqueeze(2) # O I 1 K K + new_weight = torch.zeros((new_weight_.size(0), new_weight_.size(1), 4, new_weight_.size(2), new_weight_.size(3)), dtype=new_weight_.dtype) + new_weight[:, :, 1] = new_weight_ + + elif strategy == "first": + new_weight_ = old_weight# .unsqueeze(2) + new_weight = torch.zeros((new_weight_.size(0), new_weight_.size(1), 4, new_weight_.size(2), new_weight_.size(3)), dtype=new_weight_.dtype) + new_weight[:, :, 0] = new_weight_ + + elif strategy == "last": + new_weight_ = old_weight# .unsqueeze(2) + new_weight = torch.zeros((new_weight_.size(0), new_weight_.size(1), 4, new_weight_.size(2), new_weight_.size(3)), dtype=new_weight_.dtype) + new_weight[:, :, -1] = new_weight_ + else: + raise NotImplementedError + + new_state_dict[new_k] = new_weight + + elif 
"bias" in k: + new_state_dict[new_k] = state_dict[k] + else: + new_state_dict[new_k] = state_dict[k] + + + return new_state_dict + +def load_unstrictly(state_dict, model, loaded_keys=[]): + missing_keys = [] + for name, param in model.named_parameters(): + if name in state_dict: + try: + param.data.copy_(state_dict[name]) + except: + # print(f"{name} mismatch: param {name}, shape {param.data.shape}, state_dict shape {state_dict[name].shape}") + missing_keys.append(name) + elif name not in loaded_keys: + missing_keys.append(name) + return model, missing_keys + +def init_vae_only(state_dict, vae): + vae, missing_keys = load_unstrictly(state_dict, vae) + print(f"missing keys in loading vae: {[key for key in missing_keys if not key.startswith('flux')]}") + return vae + +def init_image_disc(state_dict, image_disc, args): + if args.no_init_idis or args.init_idis == "no": + state_dict = {} + else: + state_dict = state_dict["image_disc"] + # load nn.GroupNorm to Normalize class + delete_keys = [] + loaded_keys = [] + model = image_disc + for key in state_dict: + if key.endswith(".weight"): + norm_key = key.replace(".weight", ".norm.weight") + if norm_key and norm_key in model.state_dict(): + model.state_dict()[norm_key].copy_(state_dict[key]) + delete_keys.append(key) + loaded_keys.append(norm_key) + if key.endswith(".bias"): + norm_key = key.replace(".bias", ".norm.bias") + if norm_key and norm_key in model.state_dict(): + model.state_dict()[norm_key].copy_(state_dict[key]) + delete_keys.append(key) + loaded_keys.append(norm_key) + for key in delete_keys: + del state_dict[key] + msg = image_disc.load_state_dict(state_dict, strict=False) + print(f"image disc missing: {[key for key in msg.missing_keys if key not in loaded_keys]}") + print(f"image disc unexpected: {msg.unexpected_keys}") + return image_disc + +def init_video_disc(state_dict, video_disc, args): + # init video disc + if args.init_vdis == "no": + video_disc_state_dict = {} + elif args.init_vdis == "keep": + video_disc_state_dict = state_dict["video_disc"] + else: + video_disc_state_dict = inflate_dis(state_dict["video_disc"], strategy=args.init_vdis) + msg = video_disc.load_state_dict(video_disc_state_dict, strict=False) + print(f"video disc missing: {msg.missing_keys}") + print(f"video disc unexpected: {msg.unexpected_keys}") + return video_disc + +def init_vit_from_image(state_dict, vae, image_disc, video_disc, args): + if args.init_vgen == "no": + vae_state_dict = state_dict["vae"] + del vae_state_dict["encoder.to_patch_emb.1.weight"] + del vae_state_dict["encoder.to_patch_emb.1.bias"] + del vae_state_dict["encoder.to_patch_emb.2.weight"] + del vae_state_dict["encoder.to_patch_emb.2.bias"] + del vae_state_dict["encoder.to_patch_emb.3.weight"] + del vae_state_dict["encoder.to_patch_emb.3.bias"] + + del vae_state_dict["decoder.to_pixels.0.weight"] + del vae_state_dict["decoder.to_pixels.0.bias"] + vae_state_dict = state_dict["vae"] + + elif args.init_vgen == "keep": + vae_state_dict = state_dict["vae"] + else: + vae_state_dict = inflate_gen(state_dict["vae"], temporal_patch_size=args.temporal_patch_size, spatial_patch_size=args.patch_size, strategy=args.init_vgen, inflation_pe=args.inflation_pe) + + if args.vq_to_vae: + del vae_state_dict["pre_vq_conv.1.weight"] + del vae_state_dict["pre_vq_conv.1.bias"] + + msg = vae.load_state_dict(vae_state_dict, strict=False) + print(f"vae missing: {msg.missing_keys}") + print(f"vae unexpected: {msg.unexpected_keys}") + + image_disc = init_image_disc(state_dict, image_disc, args) + # video_disc = 
init_video_disc(state_dict, image_disc, args) # random init video discriminator + + return vae, image_disc, video_disc + +def load_cnn(model, state_dict, prefix, expand=False, use_linear=False): + delete_keys = [] + loaded_keys = [] + for key in state_dict: + if key.startswith(prefix): + _key = key[len(prefix):] + if _key in model.state_dict(): + # load nn.Conv2d or nn.Linear to nn.Linear + if use_linear and (".q.weight" in key or ".k.weight" in key or ".v.weight" in key or ".proj_out.weight" in key): + load_weights = state_dict[key].squeeze() + elif _key.endswith(".conv.weight") and expand: + if model.state_dict()[_key].shape == state_dict[key].shape: + # 2D cnn to 2D cnn + load_weights = state_dict[key] + else: + # 2D cnn to 3D cnn + _expand_dim = model.state_dict()[_key].shape[2] + load_weights = state_dict[key].unsqueeze(2).repeat(1, 1, _expand_dim, 1, 1) + load_weights = load_weights / _expand_dim # normalize across expand dim + else: + load_weights = state_dict[key] + model.state_dict()[_key].copy_(load_weights) + delete_keys.append(key) + loaded_keys.append(prefix+_key) + # load nn.Conv2d to Conv class + conv_list = ["conv"] if use_linear else ["conv", ".q.", ".k.", ".v.", ".proj_out.", ".nin_shortcut."] + if any(k in _key for k in conv_list): + if _key.endswith(".weight"): + conv_key = _key.replace(".weight", ".conv.weight") + if conv_key and conv_key in model.state_dict(): + if model.state_dict()[conv_key].shape == state_dict[key].shape: + # 2D cnn to 2D cnn + load_weights = state_dict[key] + else: + # 2D cnn to 3D cnn + _expand_dim = model.state_dict()[conv_key].shape[2] + load_weights = state_dict[key].unsqueeze(2).repeat(1, 1, _expand_dim, 1, 1) + load_weights = load_weights / _expand_dim # normalize across expand dim + model.state_dict()[conv_key].copy_(load_weights) + delete_keys.append(key) + loaded_keys.append(prefix+conv_key) + if _key.endswith(".bias"): + conv_key = _key.replace(".bias", ".conv.bias") + if conv_key and conv_key in model.state_dict(): + model.state_dict()[conv_key].copy_(state_dict[key]) + delete_keys.append(key) + loaded_keys.append(prefix+conv_key) + # load nn.GroupNorm to Normalize class + if "norm" in _key: + if _key.endswith(".weight"): + norm_key = _key.replace(".weight", ".norm.weight") + if norm_key and norm_key in model.state_dict(): + model.state_dict()[norm_key].copy_(state_dict[key]) + delete_keys.append(key) + loaded_keys.append(prefix+norm_key) + if _key.endswith(".bias"): + norm_key = _key.replace(".bias", ".norm.bias") + if norm_key and norm_key in model.state_dict(): + model.state_dict()[norm_key].copy_(state_dict[key]) + delete_keys.append(key) + loaded_keys.append(prefix+norm_key) + + for key in delete_keys: + del state_dict[key] + + return model, state_dict, loaded_keys + +def init_cnn_from_image(state_dict, vae, image_disc, video_disc, args, expand=False): + vae.encoder, state_dict["vae"], loaded_keys1 = load_cnn(vae.encoder, state_dict["vae"], prefix="encoder.", expand=expand) + vae.decoder, state_dict["vae"], loaded_keys2 = load_cnn(vae.decoder, state_dict["vae"], prefix="decoder.", expand=expand) + loaded_keys = loaded_keys1 + loaded_keys2 + # msg = vae.load_state_dict(state_dict["vae"], strict=False) + # print(f"vae missing: {[key for key in msg.missing_keys if key not in loaded_keys]}") + # print(f"vae unexpected: {msg.unexpected_keys}") + vae, missing_keys = load_unstrictly(state_dict["vae"], vae, loaded_keys) + + if image_disc: + image_disc = init_image_disc(state_dict, image_disc, args) + ### random init video discriminator + # 
if video_disc: + # video_disc = init_video_disc(state_dict, image_disc, args) + return vae, image_disc, video_disc + +def resume_from_ckpt(state_dict, model_optims, load_optims=True): + all_missing_keys = [] + # load weights first + for k in model_optims: + if model_optims[k] and state_dict[k] and (not is_torch_optim_sch(model_optims[k])) and k in state_dict: + model_optims[k], missing_keys = load_unstrictly(state_dict[k], model_optims[k]) + all_missing_keys += missing_keys + + if len(all_missing_keys) == 0 and load_optims: + print("Loading optimizer states") + for k in model_optims: + if model_optims[k] and state_dict[k] and is_torch_optim_sch(model_optims[k]) and k in state_dict: + model_optims[k].load_state_dict(state_dict[k]) + else: + print(f"missing weights: {all_missing_keys}, load_optims={load_optims}, do not load optimzer states") + return model_optims, state_dict["step"] + +### old version +# def get_last_ckpt(root_dir): +# if not os.path.exists(root_dir): return None, None +# ckpt_files = {} +# for dirpath, dirnames, filenames in os.walk(root_dir): +# for filename in filenames: +# if filename.endswith('.ckpt'): +# num_iter = int(filename.split('-')[1].split('=')[1]) +# ckpt_files[num_iter]=os.path.join(dirpath, filename) +# iter_list = list(ckpt_files.keys()) +# if len(iter_list) == 0: return None, None +# max_iter = max(iter_list) +# return ckpt_files[max_iter], max_iter diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/mfu.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/mfu.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ba0b19fa6f643a777af50183fb26401f533728 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/mfu.py @@ -0,0 +1,181 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import os +import math +from datetime import datetime +from abc import ABC, abstractmethod + +import torch +from torch import nn +import torch.distributed as dist +from torch.nn.modules.conv import _ConvNd +from torch.utils.checkpoint import TorchDispatchMode + + +def get_device_tflops(): + peak_tflops = -1 + arch = torch.cuda.get_device_capability() + if arch[0] == 8 and arch[1] == 0: # A100/A800 + peak_tflops = 312 + elif arch[0] == 9 and arch[1] == 0: # H100/H800 + peak_tflops = 989 + else: + print(f"unknown default tflops of device capability {arch[0]}.{arch[1]}") + return peak_tflops + + +class NullCtx(TorchDispatchMode): + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + return func(*args, **kwargs) + + +class DisableMfu(NullCtx): + def __enter__(self): + super().__enter__() + self.old_flop_enable = Flops.enable + Flops.enable = False + + def __exit__(self, *args, **kwargs): + Flops.enable = self.old_flop_enable + super().__exit__(*args, **kwargs) + + +def context_fn(): + return NullCtx(), DisableMfu() + + +class CustomFlops(ABC): + """ + For functions, + 1. run the func within CustomFlops + 2. 
implement the hook `flops` + to support register_forward_hook + """ + @abstractmethod + def flops(self, args, kwargs, output) -> dict: + pass + + +def conv_flops_func(module, args, kwargs, output): + return 2 * math.prod(module.kernel_size) * module.in_channels * output.numel() + + +def linear_flops_func(module, args, kwargs, output): + return 2 * module.in_features * output.numel() + + +def layernorm_flops_func(module, args, kwargs, output): + return 4 * output.numel() + + +def groupnorm_flops_func(module, args, kwargs, output): + return 2 * output.numel() + + +def syncbatchnorm_flops_func(module, args, kwargs, output): + return 2 * output.numel() + + +basic_flops_func = { + _ConvNd: conv_flops_func, + nn.Linear: linear_flops_func, + nn.LayerNorm: layernorm_flops_func, + nn.GroupNorm: groupnorm_flops_func, + nn.SyncBatchNorm: syncbatchnorm_flops_func, +} + +@torch._dynamo.disable() +def calculate_flops(module, args, kwargs, output): + flops = 0 + flops_dict = {} + if isinstance(module, CustomFlops): + flops_dict = module.flops(args, kwargs, output) + else: + flops_func = basic_flops_func[module._base_m] + flops_dict = {module.__class__.__name__: flops_func(module, args, kwargs, output)} + + for module_class, module_flops in flops_dict.items(): + if module_class not in Flops.module_flops_dict: + Flops.module_flops_dict[module_class] = module_flops * (3 if module.training else 1) + else: + Flops.module_flops_dict[module_class] += module_flops * (3 if module.training else 1) + + flops = sum(list(flops_dict.values())) + Flops.flops += flops * (3 if module.training else 1) + + +class Flops: + handlers = [] + flops = 0 + enable = True + module_flops_dict = {} + + @staticmethod + def reset(): + tmp = Flops.flops + Flops.flops = 0 + Flops.module_flops_dict = {} + return tmp + + @staticmethod + def _hook(module, args, kwargs, output): + if not Flops.enable: + return + + if module.training and not torch.is_grad_enabled(): + # activation checkpoint mode + return + calculate_flops(module, args, kwargs, output) + + @staticmethod + def _dfs_register_hooks(parent_name: str, cur_m: nn.Module): + for name, m in cur_m.named_children(): + # custom hooks + if isinstance(m, CustomFlops): + assert isinstance(m, nn.Module) + Flops.handlers.append( + m.register_forward_hook(Flops._hook, with_kwargs=True) + ) + continue + # built-in hooks + is_registered = False + for base_m, flops_func in basic_flops_func.items(): + if isinstance(m, base_m): + m._base_m = base_m + Flops.handlers.append( + m.register_forward_hook(Flops._hook, with_kwargs=True) + ) + is_registered = True + break + if not is_registered: + Flops._dfs_register_hooks(parent_name + "." 
+ name, m) + + @staticmethod + def unwrap(self): + for hdl in Flops.handlers: + hdl.remove() + + + +def register_mfu_hook(model): + Flops._dfs_register_hooks("root", model) + + +def get_tflops(): + return Flops.flops / 1e12 + + +def get_tflops_dict(record_iters=1): + tflops_dict = {module: round(flops / record_iters/ 1e12, 3) for module, flops in Flops.module_flops_dict.items()} + return tflops_dict + + +def get_mfu(iter_time): + # compute MFU + ideal_TFLOPS = get_device_tflops() + achieve_TFLOPs = Flops.reset() / 1e12 + mfu = achieve_TFLOPs / iter_time / ideal_TFLOPS + return mfu diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/misc.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..3a0418b7151e6c9947a508b387b0b0e46a436666 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/misc.py @@ -0,0 +1,290 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import torch +import torch.distributed as dist +import imageio +import os +import random + +import math +import numpy as np +from einops import rearrange +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler + +import sys +import pdb as pdb_original +from contextlib import contextmanager + +COLOR_BLUE = "\033[94m" +COLOR_RESET = "\033[0m" +ptdtype = {None: torch.float32, 'fp32': torch.float32, 'bf16': torch.bfloat16} + +def rank_zero_only(fn): + def wrapped_fn(*args, **kwargs): + if not dist.is_initialized() or dist.get_rank() == 0: + return fn(*args, **kwargs) + return wrapped_fn + +@rank_zero_only +def print_gpu_usage(model_name) -> None: + allocated_memory = torch.cuda.memory_allocated() + reserved_memory = torch.cuda.memory_reserved() + print(f"after {model_name} backward Allocated Memory: {allocated_memory}, Reserved Memory: {reserved_memory}") + torch.cuda.empty_cache() + +def seed_everything(seed=0, allow_tf32=True, benchmark=True, deterministic=False): + random.seed(seed) + np.random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = benchmark # default False in torch 2.3.1 + + # See https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + # See https://pytorch.org/docs/stable/notes/randomness.html + torch.use_deterministic_algorithms(deterministic) + + torch.backends.cudnn.allow_tf32 = allow_tf32 # default True in torch 2.3.1 + torch.backends.cuda.matmul.allow_tf32 = allow_tf32 # default True in torch 2.3.1 + +# Function to print model summary in table format +@rank_zero_only +def print_model_summary(models): + # Table headers + print(f"{'Layer Name':<20} {'Param #':<20}") + print("="*40) + total_params = 0 + for model in models: + for name, module in model.named_children(): + params = sum(p.numel() for p in module.parameters()) + total_params += params + params_str = f"{params/1e6:.2f}M" + print(f"{name:<20} {params_str:<20}") + print("="*40) + print(f"Total number of parameters: {total_params/1e6:.2f}M") + +def version_checker(base_version, high_version): + try: + from bytedance.ndtimeline import __version__ + from packaging.version import Version + if Version(__version__) < Version(base_version) or Version(__version__) >= Version(high_version): + raise RuntimeError(f"bytedance.ndtimeline's version should be >={base_version} 
<{high_version}, but {__version__} found") + except ImportError: + raise RuntimeError(f"bytedance.ndtimeline's version should be >={base_version} <{high_version}") + +def is_torch_optim_sch(obj): + return isinstance(obj, (optim.Optimizer, optim.lr_scheduler.LambdaLR)) + +def rearranged_forward(x, func): + if x.ndim == 4: + x = rearrange(x, "B C H W -> B H W C") + elif x.ndim == 5: + x = rearrange(x, "B C T H W -> B T H W C") + x = func(x) + if x.ndim == 4: + x = rearrange(x, "B H W C -> B C H W") + elif x.ndim == 5: + x = rearrange(x, "B T H W C -> B C T H W") + return x + +def is_dtype_16(data): + return data.dtype == torch.float16 or data.dtype == torch.bfloat16 + +@contextmanager +def set_tf32_flags(flag): + old_matmul_flag = torch.backends.cuda.matmul.allow_tf32 + old_cudnn_flag = torch.backends.cudnn.allow_tf32 + torch.backends.cuda.matmul.allow_tf32 = flag + torch.backends.cudnn.allow_tf32 = flag + try: + yield + finally: + # Restore the original flags + torch.backends.cuda.matmul.allow_tf32 = old_matmul_flag + torch.backends.cudnn.allow_tf32 = old_cudnn_flag + +class ByteNASManager: + bytenas_dir = { + + } + _current_bytenas = None + _username = None + + @classmethod + def set_bytenas(cls, bytenas, username="zhufengda"): + cls._current_bytenas = bytenas + cls._username = username + + @classmethod + def get_work_dir(cls, use_username=True): + if use_username: + username = cls._username + else: + username = "" + base_dir = cls.bytenas_dir[cls._current_bytenas] + return os.path.join(base_dir, username) + + @classmethod + def __call__(cls, rel_path, use_username=True, prefix=""): + return os.path.join(cls.get_work_dir(use_username=use_username), prefix, rel_path) + +bytenas_manager = ByteNASManager() + +def get_last_ckpt(root_dir): + if not os.path.exists(root_dir): return None + ckpt_files = {} + for dirpath, dirnames, filenames in os.walk(root_dir): + for filename in filenames: + if filename.endswith('.ckpt'): + num_iter = int(filename.split('.ckpt')[0].split('_')[-1]) + ckpt_files[num_iter]=os.path.join(dirpath, filename) + iter_list = list(ckpt_files.keys()) + if len(iter_list) == 0: return None + max_iter = max(iter_list) + return ckpt_files[max_iter] + + +# Shifts src_tf dim to dest dim +# i.e. shift_dim(x, 1, -1) would be (b, c, t, h, w) -> (b, t, h, w, c) +def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True): + n_dims = len(x.shape) + if src_dim < 0: + src_dim = n_dims + src_dim + if dest_dim < 0: + dest_dim = n_dims + dest_dim + + assert 0 <= src_dim < n_dims and 0 <= dest_dim < n_dims + + dims = list(range(n_dims)) + del dims[src_dim] + + permutation = [] + ctr = 0 + for i in range(n_dims): + if i == dest_dim: + permutation.append(src_dim) + else: + permutation.append(dims[ctr]) + ctr += 1 + x = x.permute(permutation) + if make_contiguous: + x = x.contiguous() + return x + + +# reshapes tensor start from dim i (inclusive) +# to dim j (exclusive) to the desired shape +# e.g. 
if x.shape = (b, thw, c) then +# view_range(x, 1, 2, (t, h, w)) returns +# x of shape (b, t, h, w, c) +def view_range(x, i, j, shape): + shape = tuple(shape) + + n_dims = len(x.shape) + if i < 0: + i = n_dims + i + + if j is None: + j = n_dims + elif j < 0: + j = n_dims + j + + assert 0 <= i < j <= n_dims + + x_shape = x.shape + target_shape = x_shape[:i] + shape + x_shape[j:] + return x.view(target_shape) + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.reshape(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def tensor_slice(x, begin, size): + assert all([b >= 0 for b in begin]) + size = [l - b if s == -1 else s + for s, b, l in zip(size, begin, x.shape)] + assert all([s >= 0 for s in size]) + + slices = [slice(b, b + s) for b, s in zip(begin, size)] + return x[slices] + + +def save_video_grid(video, fname, nrow=None, fps=16): + b, c, t, h, w = video.shape + video = video.permute(0, 2, 3, 4, 1).contiguous() + + video = (video.detach().cpu().numpy() * 255).astype('uint8') + if nrow is None: + nrow = math.ceil(math.sqrt(b)) + ncol = math.ceil(b / nrow) + padding = 1 + video_grid = np.zeros((t, (padding + h) * nrow + padding, + (padding + w) * ncol + padding, c), dtype='uint8') + # print(video_grid.shape) + for i in range(b): + r = i // ncol + c = i % ncol + start_r = (padding + h) * r + start_c = (padding + w) * c + video_grid[:, start_r:start_r + h, start_c:start_c + w] = video[i] + video = [] + for i in range(t): + video.append(video_grid[i]) + imageio.mimsave(fname, video, fps=fps) + # skvideo.io.vwrite(fname, video_grid, inputdict={'-r': '5'}) + # print('saved videos to', fname) + + +def comp_getattr(args, attr_name, default=None): + if hasattr(args, attr_name): + return getattr(args, attr_name) + else: + return default + + +def visualize_tensors(t, name=None, nest=0): + if name is not None: + print(name, "current nest: ", nest) + print("type: ", type(t)) + if 'dict' in str(type(t)): + print(t.keys()) + for k in t.keys(): + if t[k] is None: + print(k, "None") + else: + if 'Tensor' in str(type(t[k])): + print(k, t[k].shape) + elif 'dict' in str(type(t[k])): + print(k, 'dict') + visualize_tensors(t[k], name, nest + 1) + elif 'list' in str(type(t[k])): + print(k, len(t[k])) + visualize_tensors(t[k], name, nest + 1) + elif 'list' in str(type(t)): + print("list length: ", len(t)) + for t2 in t: + visualize_tensors(t2, name, nest + 1) + elif 'Tensor' in str(type(t)): + print(t.shape) + else: + print(t) + return "" diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/nan_detector.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/nan_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..cfe2019be289628d56d67634245b9589f4ab03fe --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/nan_detector.py @@ -0,0 +1,105 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import os +import logging + +import torch + +logger = logging.getLogger(__name__) +RANK = int(os.environ["RANK"]) if "RANK" in os.environ else 0 + +class NanDetector: + """ + Detects the first NaN or Inf in forward and/or backward pass and logs, together with the 
module name + """ + + def __init__(self, model, forward=True, backward=True): + self.bhooks = [] + self.fhooks = [] + self.forward = forward + self.backward = backward + self.named_parameters = list(model.named_parameters()) + self.reset() + + for name, mod in model.named_modules(): + mod.__module_name = name + self.add_hooks(mod) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + # Dump out all model gnorms to enable better debugging + norm = {} + gradients = {} + for name, param in self.named_parameters: + if param.grad is not None: + grad_norm = torch.norm(param.grad.data, p=2, dtype=torch.float32) + norm[name] = grad_norm.item() + if torch.isnan(grad_norm).any() or torch.isinf(grad_norm).any(): + gradients[name] = param.grad.data + if len(gradients) > 0: + logger.info("Detected nan/inf grad norm, dumping norms...") + logger.info(f"norms: {norm}") + logger.info(f"gradients: {gradients}") + + self.close() + + def add_hooks(self, module): + if self.forward: + self.fhooks.append(module.register_forward_hook(self.fhook_fn)) + if self.backward: + self.bhooks.append(module.register_backward_hook(self.bhook_fn)) + + def reset(self): + self.has_printed_f = False + self.has_printed_b = False + + def _detect(self, tensor, name, backward): + err = None + if ( + torch.is_floating_point(tensor) + # single value tensors (like the loss) will not provide much info + and tensor.numel() >= 2 + ): + with torch.no_grad(): + if torch.isnan(tensor).any(): + err = "NaN" + elif torch.isinf(tensor).any(): + err = "Inf" + if err is not None: + err = f"{err} detected in output of {name}, shape: {tensor.shape}, {'backward' if backward else 'forward'}" + return err + + def _apply(self, module, inp, x, backward): + if torch.is_tensor(x): + if isinstance(inp, tuple) and len(inp) > 0: + inp = inp[0] + err = self._detect(x, module.__module_name, backward) + if err is not None: + if torch.is_tensor(inp) and not backward: + err += ( + f" input max: {inp.max().item()}, input min: {inp.min().item()}" + ) + has_printed_attr = "has_printed_b" if backward else "has_printed_f" + logger.warning(f"rank-{RANK}, err_info : {err}") + setattr(self, has_printed_attr, True) + elif isinstance(x, dict): + for v in x.values(): + self._apply(module, inp, v, backward) + elif isinstance(x, list) or isinstance(x, tuple): + for v in x: + self._apply(module, inp, v, backward) + + def fhook_fn(self, module, inp, output): + if not self.has_printed_f: + self._apply(module, inp, output, backward=False) + + def bhook_fn(self, module, inp, output): + if not self.has_printed_b: + self._apply(module, inp, output, backward=True) + + def close(self): + for hook in self.fhooks + self.bhooks: + hook.remove() diff --git a/Meissonic/InfinityStar/infinity/models/videovae/utils/scheduler.py b/Meissonic/InfinityStar/infinity/models/videovae/utils/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..dcc889ce6f07c471fbe2e573ea37423896c7acd1 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/models/videovae/utils/scheduler.py @@ -0,0 +1,15 @@ + +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +def get_lambda(args): + if args.scheduler == "linear": + def lr_lambda(step): + warmup_steps = args.warmup_steps + if step < warmup_steps: + return step / warmup_steps + else: + return 1. 
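+ # Linear warmup: the multiplier ramps from 0 to 1 over args.warmup_steps and + # then stays at 1. Intended for torch.optim.lr_scheduler.LambdaLR, e.g. + # scheduler = LambdaLR(optimizer, lr_lambda=get_lambda(args))  # illustrative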
+ return lr_lambda + else: + raise NotImplementedError diff --git a/Meissonic/InfinityStar/infinity/schedules/__init__.py b/Meissonic/InfinityStar/infinity/schedules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..744db04a40796b4a74ea668709b42d111e00db75 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/schedules/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +def get_encode_decode_func(dynamic_scale_schedule): + if dynamic_scale_schedule == 'infinity_star_interact': + from infinity.schedules.infinity_star_interact import video_encode, video_decode, get_visual_rope_embeds, get_scale_pack_info + elif 'infinity_elegant' in dynamic_scale_schedule: + from infinity.schedules.infinity_elegant import video_encode, video_decode, get_visual_rope_embeds, get_scale_pack_info + else: + raise NotImplementedError(f'dynamic_scale_schedule not implemented: {dynamic_scale_schedule}') + return video_encode, video_decode, get_visual_rope_embeds, get_scale_pack_info diff --git a/Meissonic/InfinityStar/infinity/schedules/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/infinity/schedules/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e2407cdb3bfdf5e7a21c79b8e1cecaf5953f7d0 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/schedules/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/schedules/__pycache__/dynamic_resolution.cpython-310.pyc b/Meissonic/InfinityStar/infinity/schedules/__pycache__/dynamic_resolution.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6fb9e265b0ae1266daab5e7617822a3251f5f61 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/schedules/__pycache__/dynamic_resolution.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/schedules/dynamic_resolution.py b/Meissonic/InfinityStar/infinity/schedules/dynamic_resolution.py new file mode 100644 index 0000000000000000000000000000000000000000..20665feaa3fc6bb9c934a8a8a4ec9f3bbc6e366e --- /dev/null +++ b/Meissonic/InfinityStar/infinity/schedules/dynamic_resolution.py @@ -0,0 +1,217 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import json +import math +import copy + +import tqdm +import numpy as np + +vae_stride = 16 +ratios = [1.000, 1.250, 1.333, 1.500, 1.750, 2.000, 2.500, 3.000] + +def get_ratio2hws_video_v2(): + ratio2hws_video_common_v2 = {} + for h_div_w in [1, 100/116, 3/4, 2/3, 9/16, 1/2, 2/5, 1/3]: + scale_schedule = [] + # 40*40 is 480p, 60*60 is 720p + # for scale in list(range(1,1+16)) + [20, 24, 30, 40]: + for scale in [1,2,3,4,5,6,7,8,10,12,16] + [24, 32, 40, 60]: + area = scale * scale + pw_float = math.sqrt(area / h_div_w) + ph_float = pw_float * h_div_w + ph, pw = int(np.round(ph_float)), int(np.round(pw_float)) + scale_schedule.append((ph, pw)) + ratio2hws_video_common_v2[h_div_w] = scale_schedule + total_pixels2scales = { + '0.06M': 11, + '0.25M': 13, + '0.40M': 14, + '0.90M': 15, + } + return ratio2hws_video_common_v2, total_pixels2scales + +def append_dummy_t(ratio2hws): + for key in ratio2hws: + for i in range(len(ratio2hws[key])): + h, w = ratio2hws[key][i] + ratio2hws[key][i] = (1, h, w) + return ratio2hws + +def get_first_full_spatial_size_scale_index(vae_scale_schedule): + for si, (pt, ph, pw) in enumerate(vae_scale_schedule): + if vae_scale_schedule[si][-2:] == vae_scale_schedule[-1][-2:]: + return si + +def 
get_full_spatial_size_scale_indices(vae_scale_schedule): + full_spatial_size_scale_indices = [] + for si, (pt, ph, pw) in enumerate(vae_scale_schedule): + if vae_scale_schedule[si][-2:] == vae_scale_schedule[-1][-2:]: + full_spatial_size_scale_indices.append(si) + return full_spatial_size_scale_indices + +def repeat_schedule(scale_schedule, repeat_scales_num, times): + new_scale_schedule = [] + for i in range(repeat_scales_num): + new_scale_schedule.extend([scale_schedule[i] for _ in range(times)]) + new_scale_schedule.extend(scale_schedule[repeat_scales_num:]) + return new_scale_schedule + +def get_ratio2hws_pixels2scales(dynamic_scale_schedule, video_frames): + compressed_frames = video_frames // 4 + 1 + if dynamic_scale_schedule == '13_hand_craft': + ratio2hws = { + 1.000: [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16),(20,20),(24,24),(32,32),(40,40),(48,48),(64,64)], + 1.250: [(1,1),(2,2),(3,3),(5,4),(10,8),(15,12),(20,16),(25,20),(30,24),(35,28),(45,36),(55,44),(70,56)], + 1.333: [(1,1),(2,2),(4,3),(8,6),(12,9),(16,12),(20,15),(24,18),(28,21),(36,27),(48,36),(60,45),(72,54)], + 1.500: [(1,1),(2,2),(3,2),(6,4),(9,6),(15,10),(21,14),(27,18),(33,22),(39,26),(48,32),(63,42),(78,52)], + 1.750: [(1,1),(2,2),(3,3),(7,4),(11,6),(14,8),(21,12),(28,16),(35,20),(42,24),(56,32),(70,40),(84,48)], + 2.000: [(1,1),(2,2),(4,2),(6,3),(10,5),(16,8),(22,11),(30,15),(38,19),(46,23),(60,30),(74,37),(90,45)], + 2.500: [(1,1),(2,2),(5,2),(10,4),(15,6),(20,8),(25,10),(30,12),(40,16),(50,20),(65,26),(80,32),(100,40)], + 3.000: [(1,1),(2,2),(6,2),(9,3),(15,5),(21,7),(27,9),(36,12),(45,15),(54,18),(72,24),(90,30),(111,37)], + } + ratio2hws = append_dummy_t(ratio2hws) + total_pixels2scales = { + '0.06M': 7, + '0.25M': 10, + '1M': 13, + } + predefined_t = [1 for _ in range(len(ratio2hws[1.000]))] + dynamic_resolution_h_w = get_full_ratio2hws(ratio2hws, video_frames, total_pixels2scales, predefined_t) + for ratio in dynamic_resolution_h_w: + for pn in dynamic_resolution_h_w[ratio]: + base_scale_schedule = dynamic_resolution_h_w[ratio][pn]['scales'] + ts = np.round(np.linspace(1,compressed_frames,7)) + dynamic_resolution_h_w[ratio][pn]['image_scales'] = base_scale_schedule + if dynamic_scale_schedule == 'infinity_loop_full_time': + dynamic_resolution_h_w[ratio][pn]['video_scales'] = [(compressed_frames, pn[1], pn[2]) for pn in base_scale_schedule] + else: + dynamic_resolution_h_w[ratio][pn]['video_scales'] = [(int(t), pn[1], pn[2]) for (t, pn) in zip(ts, base_scale_schedule)] + del dynamic_resolution_h_w[ratio][pn]['scales'] + elif dynamic_scale_schedule in ['infinity_elegant_clip20frames_v2', 'infinity_star_interact']: + ratio2hws, total_pixels2scales = get_ratio2hws_video_v2() + ratio2hws = append_dummy_t(ratio2hws) + dynamic_resolution_h_w = get_full_ratio2hws(ratio2hws, video_frames, total_pixels2scales, predefined_t=None) + compressed_frames_in_one_clip = 20 + compressed_frames_per_sec = 16 // 4 + duration_resolution = 1 + for ratio in dynamic_resolution_h_w: + for pn in dynamic_resolution_h_w[ratio]: + base_scale_schedule = dynamic_resolution_h_w[ratio][pn]['scales'] + image_scale_schedule = base_scale_schedule + spatial_time_schedule = [] + spatial_time_schedule.extend(image_scale_schedule) + assert (compressed_frames - 1) % compressed_frames_in_one_clip == 0 + clips = (compressed_frames - 1) // compressed_frames_in_one_clip + scales_in_one_clip = len(base_scale_schedule) + for _ in range(clips): + spatial_time_schedule.extend([(compressed_frames_in_one_clip, h, w) for _, h, w in base_scale_schedule]) + 
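# pt2scale_schedule maps a compressed frame count pt to the scale schedule used for that clip length; pt=1 holds the image-only schedule +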
dynamic_resolution_h_w[ratio][pn]['pt2scale_schedule'] = {1: image_scale_schedule} + pt_interval = duration_resolution*compressed_frames_per_sec + for pt in range(1+compressed_frames_per_sec,compressed_frames+1, pt_interval): # duration_resolution is 1s + tmp_clips = 1 + int(np.ceil((pt-1) / compressed_frames_in_one_clip)) + dynamic_resolution_h_w[ratio][pn]['pt2scale_schedule'][pt] = spatial_time_schedule[:scales_in_one_clip*tmp_clips] + pt_last_clip = (pt - 1) % compressed_frames_in_one_clip + if pt_last_clip > 0: + for i in range(scales_in_one_clip): + tmp_t, tmp_h, tmp_w = dynamic_resolution_h_w[ratio][pn]['pt2scale_schedule'][pt][-i-1] + dynamic_resolution_h_w[ratio][pn]['pt2scale_schedule'][pt][-i-1] = (pt_last_clip, tmp_h, tmp_w) + dynamic_resolution_h_w[ratio][pn]['image_scales'] = scales_in_one_clip + dynamic_resolution_h_w[ratio][pn]['scales_in_one_clip'] = scales_in_one_clip + dynamic_resolution_h_w[ratio][pn]['max_video_scales'] = len(dynamic_resolution_h_w[ratio][pn]['pt2scale_schedule'][compressed_frames]) + del dynamic_resolution_h_w[ratio][pn]['scales'] + elif dynamic_scale_schedule == 'infinity_star_extract_features': + ratio2hws, total_pixels2scales = get_ratio2hws_video_v2() + ratio2hws = append_dummy_t(ratio2hws) + dynamic_resolution_h_w = get_full_ratio2hws(ratio2hws, video_frames, total_pixels2scales, predefined_t=None) + for ratio in dynamic_resolution_h_w: + for pn in dynamic_resolution_h_w[ratio]: + base_scale_schedule = dynamic_resolution_h_w[ratio][pn]['scales'] + image_scale_schedule = base_scale_schedule + spatial_time_schedule = [] + spatial_time_schedule.extend(image_scale_schedule) + clips = compressed_frames - 1 + dynamic_resolution_h_w[ratio][pn]['pt2scale_schedule'] = {} + for pt in range(1,compressed_frames+1, 1): # duration_resolution is 1s + dynamic_resolution_h_w[ratio][pn]['pt2scale_schedule'][pt] = [(pt, h, w) for _, h, w in base_scale_schedule] + dynamic_resolution_h_w[ratio][pn]['image_scales'] = len(base_scale_schedule) + dynamic_resolution_h_w[ratio][pn]['scales_in_one_clip'] = len(base_scale_schedule) + dynamic_resolution_h_w[ratio][pn]['max_video_scales'] = len(base_scale_schedule) + del dynamic_resolution_h_w[ratio][pn]['scales'] + else: + raise ValueError(f'dynamic_scale_schedule={dynamic_scale_schedule} not implemented') + return dynamic_resolution_h_w + + +def get_full_ratio2hws(ratio2hws, video_frames, total_pixels2scales, predefined_t=None): + compressed_frames = video_frames//4+1 + if predefined_t and predefined_t != 'auto': + refined_predefined_t = [min(t, compressed_frames) for t in predefined_t] + full_ratio2hws = {} + for ratio, hws in ratio2hws.items(): + real_ratio = hws[-1][1] / hws[-1][2] + full_ratio2hws[int(real_ratio*1000)/1000] = hws + if ratio != 1.000: + full_ratio2hws[int(1/real_ratio*1000)/1000] = [(item[0], item[2], item[1]) for item in hws] + + dynamic_resolution_h_w = {} + for ratio in full_ratio2hws: + dynamic_resolution_h_w[ratio] = {} + for total_pixels, scales_num in total_pixels2scales.items(): + pixel = (full_ratio2hws[ratio][scales_num-1][1] * vae_stride, full_ratio2hws[ratio][scales_num-1][2] * vae_stride) + scales = full_ratio2hws[ratio][:scales_num] + if predefined_t and predefined_t != 'auto': + scales = [ (t, h, w) for t, (_, h, w) in zip(refined_predefined_t, scales) ] + elif predefined_t == 'auto': + refined_predefined_t = np.linspace(1, compressed_frames, scales_num).astype(int) + scales = [ (t, h, w) for t, (_, h, w) in zip(refined_predefined_t, scales) ] + dynamic_resolution_h_w[ratio][total_pixels] = 
{ + 'pixel': pixel, + 'scales': scales + } + return dynamic_resolution_h_w + +def get_dynamic_resolution_meta(dynamic_scale_schedule, video_frames=1000): + dynamic_resolution_h_w = get_ratio2hws_pixels2scales(dynamic_scale_schedule, video_frames) + h_div_w_templates = [] + for h_div_w in dynamic_resolution_h_w.keys(): + h_div_w_templates.append(h_div_w) + h_div_w_templates = np.array(h_div_w_templates) + return dynamic_resolution_h_w, h_div_w_templates + +def get_h_div_w_template2indices(h_div_w_list, h_div_w_templates): + indices = list(range(len(h_div_w_list))) + h_div_w_template2indices = {} + pbar = tqdm.tqdm(total=len(indices), desc='get_h_div_w_template2indices...') + for h_div_w, index in zip(h_div_w_list, indices): + pbar.update(1) + nearest_h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w-h_div_w_templates))] + if nearest_h_div_w_template_ not in h_div_w_template2indices: + h_div_w_template2indices[nearest_h_div_w_template_] = [] + h_div_w_template2indices[nearest_h_div_w_template_].append(index) + for h_div_w_template_, sub_indices in h_div_w_template2indices.items(): + h_div_w_template2indices[h_div_w_template_] = np.array(sub_indices) + return h_div_w_template2indices + +def get_activated_h_div_w_templates(h_div_w_list, h_div_w_templates): + if h_div_w_list is None: + activated_h_div_w_templates = h_div_w_templates + else: + activated_h_div_w_templates = [] + h_div_w_templates = np.array(h_div_w_templates) + for h_div_w in h_div_w_list: + index = np.argmin(np.abs(h_div_w - h_div_w_templates)) + activated_h_div_w_templates.append(h_div_w_templates[index]) + activated_h_div_w_templates = sorted(list(set(activated_h_div_w_templates))) + return activated_h_div_w_templates + +if __name__ == '__main__': + video_frames = 81 + dynamic_resolution_h_w = get_ratio2hws_pixels2scales('infinity_elegant_clip20frames_v2', video_frames) + for h_div_w in dynamic_resolution_h_w: + if h_div_w >= 1: + for pn in ['0.25M']: + print(h_div_w, pn, np.array(dynamic_resolution_h_w[h_div_w][pn]['pt2scale_schedule'][1]).prod(-1).sum()) + + import pdb; pdb.set_trace() diff --git a/Meissonic/InfinityStar/infinity/schedules/infinity_elegant.py b/Meissonic/InfinityStar/infinity/schedules/infinity_elegant.py new file mode 100644 index 0000000000000000000000000000000000000000..d09186719968b6c50d66146ab6fe2a72fadd02e6 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/schedules/infinity_elegant.py @@ -0,0 +1,519 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import os +import json +import random + +import numpy as np +import torch +import torch.nn.functional as F + +def interpolate(tensor, size, mode, quantizer, is_semantic_scale): + """ + arguments: + tensor: (B,C,T,H,W) + size: (C1,T,H1,W1) + mode: str + quantizer: quantizer + is_semantic_scale: bool + return: + tensor: (B,*size) + """ + B, C, T, H, W = tensor.shape + C1, T, H1, W1 = size + if quantizer.other_args.use_learnable_dim_proj: + if is_semantic_scale: + if C > C1: + proj = quantizer.semantic_proj_down + elif C < C1: + proj = quantizer.semantic_proj_up + else: + if C > C1: + proj = quantizer.detail_proj_down + elif C < C1: + proj = quantizer.detail_proj_up + if C != C1: + tensor = tensor.permute(0,2,3,4,1) # (B,C,T,H,W) -> (B,T,H,W,C) + tensor = proj(tensor) # (B,T,H,W,C1) + tensor = tensor.permute(0,4,1,2,3) # (B,T,H,W,C1) -> (B,C1,T,H,W) + tensor = F.interpolate(tensor, size=(T, H1, W1), mode=mode) # (B,C1,T,H,W) -> (B,C1,T,H1,W1) + return tensor + else: + tensor = tensor.permute(0,2,1,3,4) # (B,C,T,H,W) -> 
(B,T,C,H,W) + tensor = F.interpolate(tensor, size=(C1, H1, W1), mode=mode) + tensor = tensor.permute(0,2,1,3,4) # (B,T,C1,H1,W1) -> (B,C1,T,H1,W1) + return tensor + +def get_scale_pack_info(scale_schedule, first_full_spatial_size_scale_index, args): + meta = {} + sid2clipid_innsid = {} + clipid_innsid2sid = {} + scales_per_clip = first_full_spatial_size_scale_index + 1 + compress_frames_inner_clip = args.frames_inner_clip + total_clips = len(scale_schedule) // scales_per_clip + context_clips = args.context_frames // args.frames_inner_clip + for si in range(len(scale_schedule)): + clipid = si // scales_per_clip + if clipid == 0: + frame_ss, frame_ee = 0, scale_schedule[scales_per_clip*(clipid+1)-1][0] # compressed_frame_ss and compressed_frame_ee + else: + frame_ss = scale_schedule[0][0] + (clipid-1) * compress_frames_inner_clip + frame_ee = frame_ss + scale_schedule[scales_per_clip*(clipid+1)-1][0] + if context_clips < total_clips-1: + assert scale_schedule[si][0] == compress_frames_inner_clip + sid2clipid_innsid[si] = (clipid, si % scales_per_clip) + clipid_innsid2sid[(clipid, si % scales_per_clip)] = si + # add clip ind for ref + if si <= first_full_spatial_size_scale_index: + meta[si] = { + 'clipid': clipid, + 'frame_ss': frame_ss, + 'frame_ee': frame_ee, + 'left_ref': [-1], + 'right_ref': [-1], + } + else: + meta[si] = { + 'clipid': clipid, + 'frame_ss': frame_ss, + 'frame_ee': frame_ee, + 'left_ref': [clipid-1], # list(range(clipid-1, -1, -1)), + 'right_ref': [-1], + } + meta[si]['left_ref'] = meta[si]['left_ref'][:context_clips] + # append inner scale ind to clip ind, (frame pack) + if args.context_from_largest_no > 0: + meta[si]['left_ref'] = [(meta[si]['left_ref'][i], max(0, scales_per_clip - args.context_from_largest_no - args.context_interval*i)) for i in range(len(meta[si]['left_ref']))] + meta[si]['right_ref'] = [(meta[si]['right_ref'][i], max(0, scales_per_clip - args.context_from_largest_no - args.context_interval*i)) for i in range(len(meta[si]['right_ref']))] + for si in meta: + if args.context_from_largest_no > 0: + meta[si]['left_ref_sids'], meta[si]['right_ref_sids'] = [], [] + for clipid, innsid in (meta[si]['left_ref']): + if clipid != -1: + meta[si]['left_ref_sids'].append(clipid_innsid2sid[(clipid, innsid)]) + for fid, innsid in (meta[si]['right_ref']): + if fid != -1: + meta[si]['right_ref_sids'].append(clipid_innsid2sid[(clipid, innsid)]) + meta[si]['ref_sids'] = meta[si]['left_ref_sids'] + meta[si]['right_ref_sids'] + else: + meta[si]['ref_sids'] = list(range(si)) + return meta + + +def video_encode( + vae, + inp_B3HW, + vae_features=None, + self_correction=None, + device='cuda', + args=None, + infer_mode=False, + rope2d_freqs_grid=None, + dynamic_resolution_h_w=None, + tokens_remain=9999999, + text_lens=[], + **kwargs, +): + return video_encode_global_bsc( + vae, + inp_B3HW, + vae_features, + self_correction, + device, + args, + infer_mode, + rope2d_freqs_grid, + dynamic_resolution_h_w, + tokens_remain, + text_lens, + **kwargs, + ) + +def video_encode_global_bsc( + vae, + inp_B3HW, + vae_features=None, + self_correction=None, + device='cuda', + args=None, + infer_mode=False, + rope2d_freqs_grid=None, + dynamic_resolution_h_w=None, + tokens_remain=9999999, + text_lens=[], + **kwargs, +): + if vae_features is None: + raw_features, _, _ = vae.encode_for_raw_features(inp_B3HW, scale_schedule=None, slice=True) + raw_features_list = [raw_features] + x_recon_raw = vae.decode(raw_features, slice=True) + x_recon_raw = torch.clamp(x_recon_raw, min=-1, max=1) + 
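# x_recon_raw keeps a clamped reference reconstruction of the freshly encoded features (returned in infer_mode) +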
print(f'raw_features.shape: {raw_features.shape}') + else: + raw_features_list = vae_features + # raw_features_list: list of [1,d,t,h,w]: + gt_all_bit_indices = [] + pred_all_bit_indices = [] + var_input_list = [] + sequece_packing_scales = [] # with trunk + flatten_packing_scales = [] + h_div_w_template_list = np.array(list(dynamic_resolution_h_w.keys())) + visual_rope_cache_list = [] + noise_list = [] + scale_pack_info_list = [] + image_scale_repetition = json.loads(args.image_scale_repetition) + video_scale_repetition = json.loads(args.video_scale_repetition) + scales_in_one_clip = dynamic_resolution_h_w[h_div_w_template_list[0]][args.pn]['scales_in_one_clip'] + other_info_by_scale = [] + tokens_remain = tokens_remain-sum(text_lens) + examples = len(raw_features_list) + assert len(image_scale_repetition) == len(video_scale_repetition), f'{len(image_scale_repetition)} != {len(video_scale_repetition)}' + with torch.amp.autocast('cuda', enabled = False): + for example_ind, raw_features in enumerate(raw_features_list): + t, h, w = raw_features.shape[-3:] + h_div_w = h / w + mapped_h_div_w_template = h_div_w_template_list[np.argmin(np.abs(h_div_w-h_div_w_template_list))] + min_t = min(dynamic_resolution_h_w[mapped_h_div_w_template][args.pn]['pt2scale_schedule'].keys()) + image_scale_schedule = dynamic_resolution_h_w[mapped_h_div_w_template][args.pn]['pt2scale_schedule'][min_t] + scale_schedule = dynamic_resolution_h_w[mapped_h_div_w_template][args.pn]['pt2scale_schedule'][t] + + if args.apply_spatial_patchify: + vae_scale_schedule = [(pt, ph + (ph % 2), pw + (pw % 2)) for pt, ph, pw in scale_schedule] + else: + vae_scale_schedule = scale_schedule + first_full_spatial_size_scale_index = len(image_scale_schedule) - 1 + scale_pack_info = get_scale_pack_info(vae_scale_schedule, first_full_spatial_size_scale_index, args) + scale_pack_info_list.append(scale_pack_info) + + if raw_features.dim() == 4: + codes_out = raw_features.unsqueeze(2) # [B, d, t, h, w] + else: + codes_out = raw_features # [B, d, t, h, w] + # print(f'{raw_features.shape=}, {scale_schedule=}') + v_d = codes_out.shape[1] + B, C, T, H, W = codes_out.shape + if args.noise_input: + noise = torch.randn((B, v_d, *vae_scale_schedule[0]), device=device, dtype=raw_features.dtype) + else: + noise = torch.zeros((B, v_d, *vae_scale_schedule[0]), device=device, dtype=raw_features.dtype) + if infer_mode: noise_list.append(noise) + next_var_input = noise + valid_scales = len(vae_scale_schedule) + assert len(image_scale_repetition) == len(image_scale_schedule), f'{len(image_scale_repetition)} != {len(image_scale_schedule)}' + real_si = 0 + noise_apply_strength = self_correction.noise_apply_strength + if args.noise_apply_random_one: + image_scale_cnt = len(image_scale_schedule) + video_scale_cnt = len(vae_scale_schedule) + keep_image_si = random.randint(0, image_scale_cnt-1) + if video_scale_cnt == image_scale_cnt: + keep_video_si = keep_image_si + else: + keep_video_si = random.randint(image_scale_cnt, video_scale_cnt-1) + noise_apply_strength = [noise_prob if i == keep_image_si or i == keep_video_si else 0 for i, noise_prob in enumerate(noise_apply_strength)] + for si, (pt, ph, pw) in enumerate(vae_scale_schedule): + tokens_remain = tokens_remain - np.array(scale_schedule[si]).prod() + if tokens_remain < 0 and (not args.allow_less_one_elem_in_seq or examples > 1): + valid_scales = si + break + + rel_si_in_one_clip = si % len(image_scale_schedule) + if si < len(image_scale_schedule): # image + repeat_times = 
image_scale_repetition[rel_si_in_one_clip] + else: + repeat_times = video_scale_repetition[rel_si_in_one_clip] + select_repeat_idx = np.random.randint(0, repeat_times) + frame_ss, frame_ee = scale_pack_info[si]['frame_ss'], scale_pack_info[si]['frame_ee'] + target = codes_out[:,:,frame_ss:frame_ee] + for repeat_idx in range(repeat_times): + if (not infer_mode) and (repeat_idx==select_repeat_idx): + visual_rope_cache_list.append(get_visual_rope_embeds(rope2d_freqs_grid, scale_schedule, si, real_si, device, args, scale_pack_info, first_full_spatial_size_scale_index)) + + if next_var_input.shape[-3:] != target.shape[-3:]: + next_var_input = F.interpolate(next_var_input, size=target.shape[-3:], mode=vae.quantizer.z_interplote_up).contiguous() + cum_var_input = next_var_input + this_scale_var_input = F.interpolate(cum_var_input, size=vae_scale_schedule[si], mode=vae.quantizer.z_interplote_down).contiguous() + if repeat_idx > 0 and args.inner_scale_boost: + residual = residual - quantized + else: + residual = target - cum_var_input + if args.use_two_stage_lfq: + if ph * pw >= vae.quantizer.detail_scale_min_tokens: + is_semantic_scale = False + C1 = vae.quantizer.detail_scale_dim + lfq = vae.quantizer.lfq_detail + else: + is_semantic_scale = True + C1 = vae.quantizer.semantic_scale_dim + lfq = vae.quantizer.lfq_semantic + residual = interpolate(residual, size=(C1, *vae_scale_schedule[si]), mode=vae.quantizer.z_interplote_down, quantizer=vae.quantizer, is_semantic_scale=is_semantic_scale).contiguous() + else: + residual = F.interpolate(residual, size=vae_scale_schedule[si], mode=vae.quantizer.z_interplote_down).contiguous() + try: + lfq = vae.quantizer.lfq_detail + except: + lfq = vae.quantizer.lfq + quantized, _, bit_indices, loss = lfq(residual) # quantized shape: [B, d, t, h, w], bit_indices shape: [B,t,h,w,d] + + if args.reduce_accumulate_error_method == 'bsc': + if si < min(len(vae_scale_schedule)-1, self_correction.noise_apply_layers): + pred_bit_indices, quantized = self_correction.apply_noise_requant(bit_indices, quantized, args, device, si, lfq, noise_apply_strength) + else: + pred_bit_indices = bit_indices + else: + raise NotImplementedError(args.reduce_accumulate_error_method) + + if infer_mode or (repeat_idx==select_repeat_idx): + pred_all_bit_indices.append(pred_bit_indices) + var_input_list.append(this_scale_var_input) + gt_all_bit_indices.append(bit_indices) + other_info_by_scale.append({'largest_scale': scale_schedule[-1], 'real_si': si}) + if args.use_two_stage_lfq: + quantized_scaled = interpolate(quantized, size=target.shape[-4:], mode=vae.quantizer.z_interplote_up, quantizer=vae.quantizer, is_semantic_scale=is_semantic_scale).contiguous() + else: + quantized_scaled = F.interpolate(quantized, size=target.shape[-3:], mode=vae.quantizer.z_interplote_up).contiguous() + next_var_input = cum_var_input + quantized_scaled + real_si += 1 + + if si < len(vae_scale_schedule)-1: # since first scale is [sos], here we only need len(vae_scale_schedule)-1 cum_var_input and x_BLC_wo_prefix + if vae_scale_schedule[si][-2:] == vae_scale_schedule[-1][-2:]: + if args.noise_input: + next_var_input = torch.randn((B, v_d, *vae_scale_schedule[si+1]), device=device, dtype=raw_features.dtype) + else: + next_var_input = torch.zeros((B, v_d, *vae_scale_schedule[si+1]), device=device, dtype=raw_features.dtype) + if infer_mode: noise_list.append(next_var_input) + + sequece_packing_scales.append(scale_schedule[:valid_scales]) + flatten_packing_scales.extend(scale_schedule[:valid_scales]) + if infer_mode: + 
return noise_list, x_recon_raw, pred_all_bit_indices, None, None, scale_pack_info + + # train partial scales to enable training 480p without sp + if args.allow_less_one_elem_in_seq and len(sequece_packing_scales) == 1 and np.array(sequece_packing_scales[0]).prod(-1).sum() > args.train_max_token_len: + scale_schedule = sequece_packing_scales[0] + + if args.train_with_var_seq_len: + if len(scale_schedule) == scales_in_one_clip * 3: # train 10s video + outcomes = [ + lambda: list(range(scales_in_one_clip)), + lambda: list(range(scales_in_one_clip + 8)), + lambda: list(range(scales_in_one_clip + 11)), + lambda: list(range(scales_in_one_clip + 11)) + [scales_in_one_clip+11], + lambda: list(range(scales_in_one_clip + 11)) + [scales_in_one_clip+12], + lambda: list(range(scales_in_one_clip + 11)) + [scales_in_one_clip+13], + lambda: [scales_in_one_clip-1] + [2*scales_in_one_clip-1] + list(range(2*scales_in_one_clip, 2*scales_in_one_clip + 11)), + lambda: [scales_in_one_clip-1] + [2*scales_in_one_clip-1] + list(range(2*scales_in_one_clip, 2*scales_in_one_clip + 11)), + lambda: [scales_in_one_clip-1] + [2*scales_in_one_clip-1] + [2*scales_in_one_clip + 11], + lambda: [scales_in_one_clip-1] + [2*scales_in_one_clip-1] + [2*scales_in_one_clip + 12], + lambda: [scales_in_one_clip-1] + [2*scales_in_one_clip-1] + [2*scales_in_one_clip + 13], + ] + else: + if args.drop_720p_last_scale: + outcomes = [ + lambda: list(range(scales_in_one_clip)), + lambda: list(range(scales_in_one_clip + 8)), + lambda: list(range(scales_in_one_clip + 11)), + lambda: list(range(scales_in_one_clip + 11)) + [scales_in_one_clip+11], + lambda: list(range(scales_in_one_clip + 11)) + [scales_in_one_clip+12], + lambda: list(range(scales_in_one_clip + 8)) + [scales_in_one_clip+13], + ] + else: + outcomes = [ + lambda: list(range(scales_in_one_clip)), + lambda: list(range(scales_in_one_clip + 8)), + lambda: list(range(scales_in_one_clip + 11)), + lambda: list(range(scales_in_one_clip + 11)) + [scales_in_one_clip+11], + lambda: list(range(scales_in_one_clip + 11)) + [scales_in_one_clip+12], + lambda: [scales_in_one_clip-1] + [scales_in_one_clip+13], + lambda: [scales_in_one_clip-1] + [scales_in_one_clip+14], + ] + + probabilities = np.array(json.loads(args.video_var_len_prob), dtype=np.float32)[:len(outcomes)] + probabilities /= probabilities.sum() + + # Choose one of the outcome functions based on the probabilities and execute it + select_si_list = np.random.choice(outcomes, p=probabilities)() + + else: + select_si_list = [scales_in_one_clip-1] # context first fsuper_scale_lengthsrame must be selected + if args.train_192pshort: + # select_si_list.append(2*scales_in_one_clip-4) + if args.train_192pshort > 1: + select_si_list = list(range(0, scales_in_one_clip+args.train_192pshort)) + else: + select_si_list = list(range(0, scales_in_one_clip+11)) + else: + select_si_list = list(range(0, scales_in_one_clip)) # all first frame must be selected + select_si_list.append(scales_in_one_clip + np.random.choice([11, 12, 13], p=[0.7, 0.2, 0.1])) + + other_si_list = list(range(scales_in_one_clip-1)) + list(range(scales_in_one_clip, 2*scales_in_one_clip)) + other_si_list = list(set(other_si_list) - set(select_si_list)) + np.random.shuffle(other_si_list) + train_token_len = np.array(scale_schedule)[select_si_list].prod(-1).sum() + text_lens[0] + for si in other_si_list: + token_len = np.array(scale_schedule[si]).prod(-1).sum() + if train_token_len + token_len <= args.train_max_token_len: + train_token_len += token_len + select_si_list.append(si) + + 
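# scales were packed greedily above; sort the selected indices back into schedule order before remapping +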
select_si_list.sort() + new_si_2_real_si, real_si_2_new_si = {}, {} + for new_si, real_si in enumerate(select_si_list): + new_si_2_real_si[new_si] = real_si + real_si_2_new_si[real_si] = new_si + + sequece_packing_scales = [[scale_schedule[si] for si in select_si_list]] + flatten_packing_scales = [flatten_packing_scales[si] for si in select_si_list] + gt_all_bit_indices = [gt_all_bit_indices[si] for si in select_si_list] + pred_all_bit_indices = [pred_all_bit_indices[si] for si in select_si_list] + var_input_list = [var_input_list[si] for si in select_si_list] + visual_rope_cache_list = [visual_rope_cache_list[si] for si in select_si_list] + other_info_by_scale = [other_info_by_scale[si] for si in select_si_list] + + # remap scale_pack_info + new_scale_pack_info = {} + for new_query_sid in new_si_2_real_si: + real_query_sid = new_si_2_real_si[new_query_sid] + new_scale_pack_info[new_query_sid] = {'ref_sids': []} + for real_ref_sid in scale_pack_info_list[0][real_query_sid]['ref_sids']: + new_ref_sid = real_si_2_new_si[real_ref_sid] + new_scale_pack_info[new_query_sid]['ref_sids'].append(new_ref_sid) + scale_pack_info_list = [new_scale_pack_info] + + scale_lengths = [ pt * ph * pw for pt,ph,pw in flatten_packing_scales] + scale_lengths = scale_lengths + text_lens + valid_scales = len(flatten_packing_scales) + len(text_lens) + + cur_seq_len = np.sum(scale_lengths) + if args.train_with_var_seq_len: + pad_seq_len = int(np.ceil(cur_seq_len/args.pad_to_multiplier))*args.pad_to_multiplier - cur_seq_len + else: + pad_seq_len = args.train_max_token_len - cur_seq_len + assert pad_seq_len >= 0, f'pad_seq_len: {pad_seq_len} < 0, {scale_lengths=}' + if pad_seq_len: + scale_lengths = scale_lengths + [pad_seq_len] + max_sid_nums = 2000 + querysid_refsid = torch.zeros((max_sid_nums, max_sid_nums), device=args.device, dtype=torch.bool) # Attention! this shape should be the same for different iterations !!! 
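+ # every packed scale always attends to itself (diagonal of the query/ref visibility mask)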
+ for i in range(valid_scales): + querysid_refsid[i][i] = True + base = 0 + for ind, scale_schedule in enumerate(sequece_packing_scales): + scale_pack_info = scale_pack_info_list[ind] + for local_querysid in range(len(scale_schedule)): + global_querysid = local_querysid + base + global_text_sid = len(flatten_packing_scales) + ind + querysid_refsid[global_querysid][global_text_sid] = True + for local_refsid in (scale_pack_info[local_querysid]['ref_sids']): + global_refsid = base + local_refsid + querysid_refsid[global_querysid][global_refsid] = True + base += len(scale_schedule) + + gt_ms_idx_Bl = [] + for item in gt_all_bit_indices: + if args.apply_spatial_patchify: + # item shape: (B,t,H,W,d) + item = item.permute(0,1,4,2,3) # (B,t,d,H,W) + # (B,t,d,H,W) -> (B,t,4d,H/2,W/2) + item = torch.nn.functional.pixel_unshuffle(item, 2) + _, tt, dd, hh, ww = item.shape + # (B,t,4d,H/2,W/2) -> (B,t,H/2,W/2,4d) -> (B,t*H/2*w/2,4d) + item = item.permute(0,1,3,4,2).reshape(B, tt*hh*ww, dd) + else: + _, tt, hh, ww, dd = item.shape + item = item.reshape(B, tt*hh*ww, dd) + gt_ms_idx_Bl.append(item.type(torch.long)) + gt_BLC = gt_ms_idx_Bl # torch.cat(gt_ms_idx_Bl, 1).contiguous().type(torch.long) + for i in range(len(var_input_list)): + if args.apply_spatial_patchify: + # (B,d,t,H,W) -> (B,t,d,H,W) -> (B,t,4d,H/2,W/2) -> (B,t,H/2,W/2,4d) + var_input_list[i] = torch.nn.functional.pixel_unshuffle(var_input_list[i].permute(0,2,1,3,4), 2).permute(0,1,3,4,2) + var_input_list[i] = var_input_list[i].reshape(B, -1, 4*vae.codebook_dim) + else: + # (B,d,t,H,W) -> (B,t,H,W,d) + var_input_list[i] = var_input_list[i].permute(0,2,3,4,1) + var_input_list[i] = var_input_list[i].reshape(B, -1, vae.codebook_dim) + x_BLC = torch.cat(var_input_list, 1) + visual_rope_cache = torch.cat(visual_rope_cache_list, dim=4) + x_BLC_mask = None + return x_BLC, x_BLC_mask, gt_BLC, pred_all_bit_indices, visual_rope_cache, sequece_packing_scales, scale_lengths, querysid_refsid, other_info_by_scale + + +def video_decode( + vae, + all_indices, + scale_schedule, + label_type, + args=None, + noise_list=None, + trunc_scales=-1, + **kwargs, +): + image_scale_repetition = json.loads(args.image_scale_repetition) + video_scale_repetition = json.loads(args.video_scale_repetition) + assert len(image_scale_repetition) == len(video_scale_repetition), f'{len(image_scale_repetition)} != {len(video_scale_repetition)}' + real_si = 0 + noise_ptr = 0 + summed_codes = [noise_list[noise_ptr]] + noise_ptr += 1 + v_d = summed_codes[0].shape[1] + for si, (pt, ph, pw) in enumerate(scale_schedule): + if trunc_scales > 0 and si >= trunc_scales: + break + if si < len(image_scale_repetition): # image + repeat_times = image_scale_repetition[si%len(image_scale_repetition)] + else: + repeat_times = video_scale_repetition[si%len(image_scale_repetition)] + for repeat_idx in range(repeat_times): + tgt_shape = (pt, scale_schedule[-1][-2], scale_schedule[-1][-1]) + if args.use_two_stage_lfq: + if ph * pw >= vae.quantizer.detail_scale_min_tokens: + is_semantic_scale = False + lfq = vae.quantizer.lfq_detail + else: + is_semantic_scale = True + lfq = vae.quantizer.lfq_semantic + codes = lfq.indices_to_codes(all_indices[real_si], label_type) + codes = interpolate(codes, size=(v_d, *tgt_shape), mode=vae.quantizer.z_interplote_up, quantizer=vae.quantizer, is_semantic_scale=is_semantic_scale).contiguous() + else: + codes = vae.quantizer.lfq_detail.indices_to_codes(all_indices[real_si], label_type) + codes = F.interpolate(codes, size=tgt_shape, 
mode=vae.quantizer.z_interplote_up).contiguous() + + summed_codes[-1] = F.interpolate(summed_codes[-1], size=tgt_shape, mode=vae.quantizer.z_interplote_up).contiguous() + summed_codes[-1] += codes + real_si += 1 + if si < len(scale_schedule) - 1: + if scale_schedule[si][-3:] == tgt_shape: + summed_codes.append(noise_list[noise_ptr]) + noise_ptr += 1 + if trunc_scales < 0: + assert real_si == len(all_indices), f'all_repeated_scales={real_si} != len(all_indices)={len(all_indices)}' + summed_codes = torch.cat(summed_codes, dim=-3) + x_recon = vae.decode(summed_codes, slice=True) + x_recon = torch.clamp(x_recon, min=-1, max=1) + return x_recon + +def get_visual_rope_embeds(rope2d_freqs_grid, scale_schedule, sid, real_sid, device=None, args=None, scale_pack_info=None, first_full_spatial_size_scale_index=None): + # freqs_scales: (2, max_scales, ceil(dim_div_2 / 4)) + # freqs_frames: (2, max_frames, ceil(dim_div_2 / 4)) + rope2d_freqs_grid['freqs_scales'] = rope2d_freqs_grid['freqs_scales'].to(device) + rope2d_freqs_grid['freqs_frames'] = rope2d_freqs_grid['freqs_frames'].to(device) + rope2d_freqs_grid['freqs_height'] = rope2d_freqs_grid['freqs_height'].to(device) + rope2d_freqs_grid['freqs_width'] = rope2d_freqs_grid['freqs_width'].to(device) + upt, uph, upw = scale_schedule[-1] + pt, ph, pw = scale_schedule[sid] + dim_div_2_div_4 = rope2d_freqs_grid['freqs_scales'].shape[2] + dim_div_2 = dim_div_2_div_4 * 4 + f_scales = rope2d_freqs_grid['freqs_scales'][:, real_sid].reshape(2, 1, dim_div_2_div_4) + frame_ss, frame_ee = scale_pack_info[sid]['frame_ss'], scale_pack_info[sid]['frame_ee'] + f_frames = rope2d_freqs_grid['freqs_frames'][:, frame_ss:frame_ee] + f_height = rope2d_freqs_grid['freqs_height'][:, (torch.arange(ph) * (uph / ph)).round().int()] + f_width = rope2d_freqs_grid['freqs_width'][:, (torch.arange(pw) * (upw / pw)).round().int()] + rope_embeds = torch.cat([ + f_scales[ :, :, None, None, None, :].expand(-1, -1, pt, ph, pw, -1), + f_frames[ :, None, :, None, None, :].expand(-1, 1, -1, ph, pw, -1), + f_height[ :, None, None, :, None, :].expand(-1, 1, pt, -1, pw, -1), + f_width[ :, None, None, None, :, :].expand(-1, 1, pt, ph, -1, -1), + ], dim=-1) # (2, 1, pt, ph, pw, dim_div_2) + rope_embeds = rope_embeds.reshape(2, 1, 1, 1, 1*pt*ph*pw, dim_div_2) # (2, 1, 1, 1, 1*pt*ph*pw, dim_div_2) + return rope_embeds diff --git a/Meissonic/InfinityStar/infinity/schedules/infinity_star_interact.py b/Meissonic/InfinityStar/infinity/schedules/infinity_star_interact.py new file mode 100644 index 0000000000000000000000000000000000000000..ade1b893d4d564e5c0ada5cac0405dd4251db825 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/schedules/infinity_star_interact.py @@ -0,0 +1,462 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import os +import json + +import numpy as np +import torch +import torch.nn.functional as F + +semantic_scale_ind = 7 +detail_frame_inds = [18,19] + +def flatten_two_level_list(two_level_list): + flatten_list = [] + for item in two_level_list: + flatten_list.extend(item) + return flatten_list + +def interpolate(tensor, size, mode, quantizer, is_semantic_scale): + """ + arguments: + tensor: (B,C,T,H,W) + size: (C1,T,H1,W1) + mode: str + quantizer: quantizer + is_semantic_scale: bool + return: + tensor: (B,*size) + """ + B, C, T, H, W = tensor.shape + C1, T, H1, W1 = size + if quantizer.other_args.use_learnable_dim_proj: + if is_semantic_scale: + if C > C1: + proj = quantizer.semantic_proj_down + elif C < C1: + proj = quantizer.semantic_proj_up + else: + 
if C > C1: + proj = quantizer.detail_proj_down + elif C < C1: + proj = quantizer.detail_proj_up + if C != C1: + tensor = tensor.permute(0,2,3,4,1) # (B,C,T,H,W) -> (B,T,H,W,C) + tensor = proj(tensor) # (B,T,H,W,C1) + tensor = tensor.permute(0,4,1,2,3) # (B,T,H,W,C1) -> (B,C1,T,H,W) + tensor = F.interpolate(tensor, size=(T, H1, W1), mode=mode) # (B,C1,T,H,W) -> (B,C1,T,H1,W1) + return tensor + else: + tensor = tensor.permute(0,2,1,3,4) # (B,C,T,H,W) -> (B,T,C,H,W) + tensor = F.interpolate(tensor, size=(C1, H1, W1), mode=mode) + tensor = tensor.permute(0,2,1,3,4) # (B,T,C1,H1,W1) -> (B,C1,T,H1,W1) + return tensor + +def get_scale_pack_info(scale_schedule, first_full_spatial_size_scale_index, args): + meta = {} + sid2clipid_innsid = {} + clipid_innsid2sid = {} + scales_per_clip = first_full_spatial_size_scale_index + 1 + compress_frames_inner_clip = args.frames_inner_clip + total_clips = len(scale_schedule) // scales_per_clip + context_clips = args.context_frames // args.frames_inner_clip + for si in range(len(scale_schedule)): + clipid = si // scales_per_clip + if clipid == 0: + frame_ss, frame_ee = 0, scale_schedule[scales_per_clip*1-1][0] + else: + frame_ss = scale_schedule[scales_per_clip*1-1][0] + (clipid-1) * compress_frames_inner_clip + frame_ee = frame_ss + scale_schedule[scales_per_clip*(clipid+1)-1][0] + if context_clips < total_clips-1: + assert scale_schedule[si][0] == compress_frames_inner_clip + sid2clipid_innsid[si] = (clipid, si % scales_per_clip) + clipid_innsid2sid[(clipid, si % scales_per_clip)] = si + # add clip ind for ref + if si <= first_full_spatial_size_scale_index: + meta[si] = { + 'clipid': clipid, + 'frame_ss': frame_ss, + 'frame_ee': frame_ee, + 'left_ref': [-1], + 'right_ref': [-1], + } + else: + meta[si] = { + 'clipid': clipid, + 'frame_ss': frame_ss, + 'frame_ee': frame_ee, + 'left_ref': [clipid-1], + 'right_ref': [-1], + } + # append inner scale ind to clip ind, (frame pack) + if args.context_from_largest_no > 0: + meta[si]['left_ref'] = [(meta[si]['left_ref'][i], max(0, scales_per_clip - args.context_from_largest_no - args.context_interval*i)) for i in range(len(meta[si]['left_ref']))] + meta[si]['right_ref'] = [(meta[si]['right_ref'][i], max(0, scales_per_clip - args.context_from_largest_no - args.context_interval*i)) for i in range(len(meta[si]['right_ref']))] + for si in meta: + meta[si]['left_ref_sids'], meta[si]['right_ref_sids'] = [], [] + for clipid, innsid in (meta[si]['left_ref']): + if clipid != -1: + meta[si]['left_ref_sids'].append(clipid_innsid2sid[(clipid, innsid)]) + for fid, innsid in (meta[si]['right_ref']): + if fid != -1: + meta[si]['right_ref_sids'].append(clipid_innsid2sid[(clipid, innsid)]) + meta[si]['ref_sids'] = meta[si]['left_ref_sids'] + meta[si]['right_ref_sids'] + return meta + + +def video_encode( + vae, + inp_B3HW, + vae_features=None, + self_correction=None, + device='cuda', + args=None, + infer_mode=False, + rope2d_freqs_grid=None, + dynamic_resolution_h_w=None, + text_lens=[], + caption_nums=None, + rank=0, + vis_verbose=False, + np_generator=None, + skip_last=0, + train_max_token_len=0, + first_frame_features=[], + **kwargs, +): + if vae_features is None: + raw_features, _, _ = vae.encode_for_raw_features(inp_B3HW, scale_schedule=None, slice=True) + raw_features_list = [raw_features] + x_recon_raw = vae.decode(raw_features[0], slice=True) + x_recon_raw = torch.clamp(x_recon_raw, min=-1, max=1) + print(f'raw_features.shape: {raw_features[0].shape}') + else: + raw_features_list = vae_features + + if np_generator is not None: + 
random_obj = np_generator + else: + random_obj = np.random.default_rng() + + # raw_features_list: list of [1,d,t,h,w]: + gt_all_bit_indices = [] + pred_all_bit_indices = [] + var_input_list = [] + sequece_packing_scales = [] # with trunk + flatten_packing_scales = [] + h_div_w_template_list = np.array(list(dynamic_resolution_h_w.keys())) + visual_rope_cache_list = [] + noise_list = [] + scale_pack_info_list = [] + image_scale_repetition = json.loads(args.image_scale_repetition) + video_scale_repetition = json.loads(args.video_scale_repetition) + scales_in_one_clip = dynamic_resolution_h_w[h_div_w_template_list[0]][args.pn]['scales_in_one_clip'] + other_info_by_scale = [] + select_repeat_idx_list = [] + examples = len(raw_features_list) + assert len(image_scale_repetition) == len(video_scale_repetition), f'{len(image_scale_repetition)} != {len(video_scale_repetition)}' + assert examples == 1, f'currently only support examples==1, but found {examples=}' + with torch.amp.autocast('cuda', enabled = False): + for example_ind, complete_raw_features in enumerate(raw_features_list): + complete_raw_features = complete_raw_features[0] + if first_frame_features[example_ind] is None: + first_frame_feature_ = complete_raw_features[:,:,0:1] # [B,d,1,h,w] + else: + first_frame_feature_ = first_frame_features[example_ind][0] # [B,d,1,h,w] + # assert complete_raw_features.shape[-3] > 21 + # the first 21 compressed frames form an I1V1 clip + # the remaining t-21 frames form a V2 clip, conditioned on the resized V1 and the last frame of V1 + new_raw_features_list = [complete_raw_features[:,:,:21], complete_raw_features[:,:,21:]] + t, h, w = new_raw_features_list[0].shape[-3:] + h_div_w = h / w + mapped_h_div_w_template = h_div_w_template_list[np.argmin(np.abs(h_div_w-h_div_w_template_list))] + min_t = min(dynamic_resolution_h_w[mapped_h_div_w_template][args.pn]['pt2scale_schedule'].keys()) + image_scale_schedule = dynamic_resolution_h_w[mapped_h_div_w_template][args.pn]['pt2scale_schedule'][min_t] + scale_schedule = dynamic_resolution_h_w[mapped_h_div_w_template][args.pn]['pt2scale_schedule'][t] + + for ind, raw_features in enumerate(new_raw_features_list): + if raw_features.numel() == 0: + break + mode = 'first_iv_clip' + global_si_base = 0 + if ind == 1: + scale_schedule = scale_schedule[scales_in_one_clip:] + scale_schedule = [(raw_features.shape[-3], ph, pw) for pt, ph, pw in scale_schedule] + mode = 'second_v_clip' + global_si_base = sum(image_scale_repetition) + sum(video_scale_repetition) + + if args.apply_spatial_patchify: + vae_scale_schedule = [(pt, ph*2, pw*2) for pt, ph, pw in scale_schedule] + else: + vae_scale_schedule = scale_schedule + first_full_spatial_size_scale_index = len(image_scale_schedule) - 1 + scale_pack_info = get_scale_pack_info(vae_scale_schedule, first_full_spatial_size_scale_index, args) + scale_pack_info_list.append(scale_pack_info) + + if raw_features.dim() == 4: + codes_out = raw_features.unsqueeze(2) # [B, d, t, h, w] + else: + codes_out = raw_features # [B, d, t, h, w] + # print(f'{raw_features.shape=}, {scale_schedule=}') + v_d = codes_out.shape[1] + B, C, T, H, W = codes_out.shape + if args.noise_input: + noise = torch.randn((B, v_d, *vae_scale_schedule[0]), device=device, dtype=raw_features.dtype) + else: + noise = torch.zeros((B, v_d, *vae_scale_schedule[0]), device=device, dtype=raw_features.dtype) + if infer_mode: noise_list.append(noise) + next_var_input = noise + valid_scales = len(vae_scale_schedule) - skip_last + assert len(image_scale_repetition) == len(image_scale_schedule), f'{len(image_scale_repetition)} != 
{len(image_scale_schedule)}' + real_si = 0 + noise_apply_strength = self_correction.noise_apply_strength + for si in range(valid_scales): + pt, ph, pw = vae_scale_schedule[si] + rel_si_in_one_clip = si % len(image_scale_schedule) + if si < len(image_scale_schedule): # image + repeat_times = image_scale_repetition[rel_si_in_one_clip] + else: + repeat_times = video_scale_repetition[rel_si_in_one_clip] + select_repeat_idx = random_obj.integers(0, repeat_times) + select_repeat_idx_list.append(select_repeat_idx) + frame_ss, frame_ee = scale_pack_info[si]['frame_ss'], scale_pack_info[si]['frame_ee'] + target = codes_out[:,:,frame_ss:frame_ee] + for repeat_idx in range(repeat_times): + if (not infer_mode) and (repeat_idx==select_repeat_idx): + visual_rope_cache_list.append(get_visual_rope_embeds(rope2d_freqs_grid, scale_schedule[-1], scale_schedule[si], list(range(frame_ss, frame_ee)), real_si, device)) + if next_var_input.shape[-3:] != target.shape[-3:]: + next_var_input = F.interpolate(next_var_input, size=target.shape[-3:], mode=vae.quantizer.z_interplote_up).contiguous() + cum_var_input = next_var_input + this_scale_var_input = F.interpolate(cum_var_input, size=vae_scale_schedule[si], mode=vae.quantizer.z_interplote_down).contiguous() + residual = target - cum_var_input + if args.use_two_stage_lfq: + if rel_si_in_one_clip >= args.semantic_scales: + is_semantic_scale = False + C1 = vae.quantizer.detail_scale_dim + lfq = vae.quantizer.lfq_detail + else: + is_semantic_scale = True + C1 = vae.quantizer.semantic_scale_dim + lfq = vae.quantizer.lfq_semantic + residual = interpolate(residual, size=(C1, *vae_scale_schedule[si]), mode=vae.quantizer.z_interplote_down, quantizer=vae.quantizer, is_semantic_scale=is_semantic_scale).contiguous() + else: + residual = F.interpolate(residual, size=vae_scale_schedule[si], mode=vae.quantizer.z_interplote_down).contiguous() + try: + lfq = vae.quantizer.lfq_detail + except: + lfq = vae.quantizer.lfq + quantized, _, bit_indices, loss = lfq(residual) # quantized shape: [B, d, t, h, w], bit_indices shape: [B,t,h,w,d] + + if args.reduce_accumulate_error_method == 'bsc': + if si < min(len(vae_scale_schedule)-1, self_correction.noise_apply_layers): + pred_bit_indices, quantized = self_correction.apply_noise_requant(bit_indices, quantized, args, device, si, lfq, noise_apply_strength, num_lvl=2, np_generator=random_obj) + else: + pred_bit_indices = bit_indices + else: + raise NotImplementedError(args.reduce_accumulate_error_method) + + if infer_mode or (repeat_idx==select_repeat_idx): + pred_all_bit_indices.append(pred_bit_indices) + var_input_list.append(this_scale_var_input) + gt_all_bit_indices.append(bit_indices) + other_info_by_scale.append({'largest_scale': scale_schedule[-1], 'real_si': si, 'mode': mode, 'global_si': real_si+global_si_base}) + if args.use_two_stage_lfq: + quantized_scaled = interpolate(quantized, size=target.shape[-4:], mode=vae.quantizer.z_interplote_up, quantizer=vae.quantizer, is_semantic_scale=is_semantic_scale).contiguous() + else: + quantized_scaled = F.interpolate(quantized, size=target.shape[-3:], mode=vae.quantizer.z_interplote_up).contiguous() + next_var_input = cum_var_input + quantized_scaled + real_si += 1 + + if si < len(vae_scale_schedule)-1: # since first scale is [sos], here we only need len(vae_scale_schedule)-1 cum_var_input and x_BLC_wo_prefix + if vae_scale_schedule[si][-2:] == vae_scale_schedule[-1][-2:]: + if args.noise_input: + next_var_input = torch.randn((B, v_d, *vae_scale_schedule[si+1]), device=device, 
dtype=raw_features.dtype) + else: + next_var_input = torch.zeros((B, v_d, *vae_scale_schedule[si+1]), device=device, dtype=raw_features.dtype) + if infer_mode: noise_list.append(next_var_input) + + sequece_packing_scales.append(scale_schedule[:valid_scales]) + if ind == 0: + former_clip_features = raw_features[:,:,-20:] + + + if infer_mode: + return noise_list, x_recon_raw, pred_all_bit_indices, None, None, scale_pack_info + + if vis_verbose: + print(f'Rank={rank}, {sequece_packing_scales=} {select_repeat_idx_list=}', force=True) + + if args.train_second_clip_only: + drop_scales = len(sequece_packing_scales[0]) + sequece_packing_scales = sequece_packing_scales[1:] + scale_pack_info_list = scale_pack_info_list[1:] + gt_all_bit_indices = gt_all_bit_indices[drop_scales:] + pred_all_bit_indices = pred_all_bit_indices[drop_scales:] + other_info_by_scale = other_info_by_scale[drop_scales:] + var_input_list = var_input_list[drop_scales:] + visual_rope_cache_list = visual_rope_cache_list[drop_scales:] + + flatten_packing_scales = flatten_two_level_list(sequece_packing_scales) + + def add_noise(features, noise_choices=[0.00, 0.15, 0.30]): + feature_std = features.std() + rand_noise_strength = np.random.choice(noise_choices) + return features + rand_noise_strength * feature_std * torch.randn_like(features) + + # add conditions + semantic_condition = F.interpolate(former_clip_features, size=(20, *scale_schedule[semantic_scale_ind][-2:]), mode=vae.quantizer.z_interplote_down) + semantic_condition = add_noise(semantic_condition) + assert former_clip_features.shape[2] == 20 + detail_condition = torch.cat([first_frame_feature_, add_noise(former_clip_features[:,:,detail_frame_inds])], dim=2) + var_input_list.extend([semantic_condition, detail_condition]) + + visual_rope_cache_list.append(get_visual_rope_embeds(rope2d_freqs_grid, detail_condition.shape[-3:], semantic_condition.shape[-3:], list(range(1, 21)), 800, device)) + visual_rope_cache_list.append(get_visual_rope_embeds(rope2d_freqs_grid, detail_condition.shape[-3:], detail_condition.shape[-3:], [0]+[item+1 for item in detail_frame_inds], 801, device)) + + # set scale_lengths and querysid_refsid + scale_lengths = [ pt * ph * pw for pt,ph,pw in flatten_packing_scales] + scale_lengths = scale_lengths + [torch.tensor(semantic_condition.shape[-3:]).prod().item(), torch.tensor(detail_condition.shape[-3:]).prod().item()] + scale_lengths = scale_lengths + text_lens + + valid_scales = len(scale_lengths) + pad_seq_len = train_max_token_len - np.sum(scale_lengths) + assert pad_seq_len >= 0, f'pad_seq_len: {pad_seq_len} < 0, {scale_lengths=}' + if pad_seq_len: + scale_lengths = scale_lengths + [pad_seq_len] + max_sid_nums = 2000 + querysid_refsid = torch.zeros((max_sid_nums, max_sid_nums), device=args.device, dtype=torch.bool) # Attention! this shape should be the same for different iterations !!! 
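+ # every packed entry (scales, conditions, text) attends to itself on the diagonal of the visibility mask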
+ for i in range(valid_scales): + querysid_refsid[i][i] = True + base = 0 + for ind, scale_schedule in enumerate(sequece_packing_scales): + real_example_ind = ind // 2 # for each example, there are two scale_schedule + scale_pack_info = scale_pack_info_list[ind] + for local_querysid in range(len(scale_schedule)): + global_querysid = base + local_querysid + if other_info_by_scale[base+local_querysid]['mode'] == 'first_iv_clip': + global_text_sid = len(flatten_packing_scales) + 2 + sum(caption_nums[:real_example_ind]) + 0 + querysid_refsid[global_querysid][global_text_sid] = True + elif other_info_by_scale[base+local_querysid]['mode'] == 'second_v_clip': + global_text_sid = len(flatten_packing_scales) + 2 + sum(caption_nums[:real_example_ind]) + 1 + querysid_refsid[global_querysid][global_text_sid] = True + querysid_refsid[global_querysid][len(flatten_packing_scales)+0] = True # i can see semantic condition + querysid_refsid[global_querysid][len(flatten_packing_scales)+1] = True # i can see detail condition + else: + raise ValueError(f'Unknown mode: {other_info_by_scale[base+local_querysid]["mode"]}') + for local_refsid in (scale_pack_info[local_querysid]['ref_sids']): + global_refsid = base + local_refsid + querysid_refsid[global_querysid][global_refsid] = True + base += len(scale_schedule) + + gt_ms_idx_Bl = [] + for item in gt_all_bit_indices: + if args.apply_spatial_patchify: + # item shape: (B,t,H,W,d) + item = item.permute(0,1,4,2,3) # (B,t,d,H,W) + # (B,t,d,H,W) -> (B,t,4d,H/2,W/2) + item = torch.nn.functional.pixel_unshuffle(item, 2) + _, tt, dd, hh, ww = item.shape + # (B,t,4d,H/2,W/2) -> (B,t,H/2,W/2,4d) -> (B,t*H/2*w/2,4d) + item = item.permute(0,1,3,4,2).reshape(B, tt*hh*ww, dd) + else: + _, tt, hh, ww, dd = item.shape + item = item.reshape(B, tt*hh*ww, dd) + gt_ms_idx_Bl.append(item.type(torch.long)) + gt_BLC = gt_ms_idx_Bl # torch.cat(gt_ms_idx_Bl, 1).contiguous().type(torch.long) + for i in range(len(var_input_list)): + if args.apply_spatial_patchify: + # (B,d,t,H,W) -> (B,t,d,H,W) -> (B,t,4d,H/2,W/2) -> (B,t,H/2,W/2,4d) + var_input_list[i] = torch.nn.functional.pixel_unshuffle(var_input_list[i].permute(0,2,1,3,4), 2).permute(0,1,3,4,2) + var_input_list[i] = var_input_list[i].reshape(B, -1, 4*vae.codebook_dim) + else: + # (B,d,t,H,W) -> (B,t,H,W,d) + var_input_list[i] = var_input_list[i].permute(0,2,3,4,1) + var_input_list[i] = var_input_list[i].reshape(B, -1, vae.codebook_dim) + x_BLC = torch.cat(var_input_list, 1) + visual_rope_cache = torch.cat(visual_rope_cache_list, dim=4) + x_BLC_mask = None + return x_BLC, x_BLC_mask, gt_BLC, pred_all_bit_indices, visual_rope_cache, sequece_packing_scales, scale_lengths, querysid_refsid, other_info_by_scale, pad_seq_len + +def video_decode( + vae, + all_indices, + scale_schedule, + label_type, + args=None, + noise_list=None, + trunc_scales=-1, + **kwargs, +): + image_scale_repetition = json.loads(args.image_scale_repetition) + video_scale_repetition = json.loads(args.video_scale_repetition) + assert len(image_scale_repetition) == len(video_scale_repetition), f'{len(image_scale_repetition)} != {len(video_scale_repetition)}' + real_si = 0 + noise_ptr = 0 + summed_codes = [] + scales_in_one_clip = args.first_full_spatial_size_scale_index+1 + clips = len(noise_list) - 1 + for clip_id in range(clips): + if clip_id == 1: + scale_schedule = scale_schedule[(args.first_full_spatial_size_scale_index+1):] + t = all_indices[-1].shape[1] # [B,t,h,w,d] + scale_schedule = [(t, ph, pw) for pt, ph, pw in scale_schedule] + 
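# each clip starts accumulating codes from its own entry in noise_list +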
summed_codes.append(noise_list[noise_ptr]) + noise_ptr += 1 + v_d = summed_codes[0].shape[1] + for si, (pt, ph, pw) in enumerate(scale_schedule): + if si < len(image_scale_repetition): # image + repeat_times = image_scale_repetition[si%len(image_scale_repetition)] + else: + repeat_times = video_scale_repetition[si%len(image_scale_repetition)] + for repeat_idx in range(repeat_times): + tgt_shape = (pt, scale_schedule[-1][-2], scale_schedule[-1][-1]) + if args.use_two_stage_lfq: + if (si % scales_in_one_clip) >= args.semantic_scales: + is_semantic_scale = False + lfq = vae.quantizer.lfq_detail + else: + is_semantic_scale = True + lfq = vae.quantizer.lfq_semantic + codes = lfq.indices_to_codes(all_indices[real_si], label_type) + codes = interpolate(codes, size=(v_d, *tgt_shape), mode=vae.quantizer.z_interplote_up, quantizer=vae.quantizer, is_semantic_scale=is_semantic_scale).contiguous() + else: + codes = vae.quantizer.lfq_detail.indices_to_codes(all_indices[real_si], label_type) + codes = F.interpolate(codes, size=tgt_shape, mode=vae.quantizer.z_interplote_up).contiguous() + + summed_codes[-1] = F.interpolate(summed_codes[-1], size=tgt_shape, mode=vae.quantizer.z_interplote_up).contiguous() + summed_codes[-1] += codes + real_si += 1 + + if si < len(scale_schedule)-1 and scale_schedule[si][-2:] == tgt_shape[-2:]: + summed_codes.append(noise_list[noise_ptr]) + noise_ptr += 1 + + summed_codes = torch.cat(summed_codes, dim=-3) + x_recon = vae.decode(summed_codes, slice=True) + x_recon = torch.clamp(x_recon, min=-1, max=1) + return x_recon + +def get_visual_rope_embeds(rope2d_freqs_grid, largest_scale, current_scale, t_list, real_sid, device=None): + # freqs_scales: (2, max_scales, ceil(dim_div_2 / 4)) + # freqs_frames: (2, max_frames, ceil(dim_div_2 / 4)) + rope2d_freqs_grid['freqs_scales'] = rope2d_freqs_grid['freqs_scales'].to(device) + rope2d_freqs_grid['freqs_frames'] = rope2d_freqs_grid['freqs_frames'].to(device) + rope2d_freqs_grid['freqs_height'] = rope2d_freqs_grid['freqs_height'].to(device) + rope2d_freqs_grid['freqs_width'] = rope2d_freqs_grid['freqs_width'].to(device) + _, uph, upw = largest_scale + pt, ph, pw = current_scale + dim_div_2_div_4 = rope2d_freqs_grid['freqs_scales'].shape[2] + dim_div_2 = dim_div_2_div_4 * 4 + f_scales = rope2d_freqs_grid['freqs_scales'][:, real_sid].reshape(2, 1, dim_div_2_div_4) + f_frames = rope2d_freqs_grid['freqs_frames'][:, t_list] + f_height = rope2d_freqs_grid['freqs_height'][:, (torch.arange(ph) * (uph / ph)).round().int()] + f_width = rope2d_freqs_grid['freqs_width'][:, (torch.arange(pw) * (upw / pw)).round().int()] + rope_embeds = torch.cat([ + f_scales[ :, :, None, None, None, :].expand(-1, -1, pt, ph, pw, -1), + f_frames[ :, None, :, None, None, :].expand(-1, 1, -1, ph, pw, -1), + f_height[ :, None, None, :, None, :].expand(-1, 1, pt, -1, pw, -1), + f_width[ :, None, None, None, :, :].expand(-1, 1, pt, ph, -1, -1), + ], dim=-1) # (2, 1, pt, ph, pw, dim_div_2) + rope_embeds = rope_embeds.reshape(2, 1, 1, 1, 1*pt*ph*pw, dim_div_2) # (2, 1, 1, 1, 1*pt*ph*pw, dim_div_2) + return rope_embeds diff --git a/Meissonic/InfinityStar/infinity/trainer/__init__.py b/Meissonic/InfinityStar/infinity/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4cfb005f58554eba13727e2b2a086129d53ca984 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/trainer/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +def get_trainer(args): + from infinity.trainer.sft_trainer import 
InfinityTrainer as Trainer + return Trainer \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/trainer/sft_trainer.py b/Meissonic/InfinityStar/infinity/trainer/sft_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..d8447fb399a49328c14223b505ee4c6f78b42527 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/trainer/sft_trainer.py @@ -0,0 +1,377 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from pprint import pformat +from typing import Optional, Tuple, Union +import os +import os.path as osp + +import torch +import torch.nn as nn +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.api import FullOptimStateDictConfig, FullStateDictConfig, StateDictType +from torch.nn.parallel import DistributedDataParallel as DDP +import numpy as np +import torch.distributed as tdist + +import infinity.utils.dist as dist +from infinity.models import Infinity +from infinity.models.ema import update_ema +from infinity.models.self_correction import SelfCorrection +from infinity.utils import arg_util, misc, wandb_utils +from infinity.utils.amp_opt import AmpOptimizer +from infinity.schedules import get_encode_decode_func +from infinity.schedules.dynamic_resolution import get_dynamic_resolution_meta + +Ten = torch.Tensor +FTen = torch.Tensor +ITen = torch.LongTensor +BTen = torch.BoolTensor +fullstate_save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) +fulloptstate_save_policy = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True) + +import queue +import threading + +def save_token(): + while True: + try: + raw_features, feature_cache_files4images = save_token_queue.get() + for i in range(len(feature_cache_files4images)): + if not osp.exists(feature_cache_files4images[i]): + os.makedirs(osp.dirname(feature_cache_files4images[i]), exist_ok=True) + torch.save(raw_features[i], feature_cache_files4images[i]) + print(f'Save to {feature_cache_files4images[i]}') + else: + print(f'{feature_cache_files4images[i]} exists, skip') + except Exception as e: + print(f"Error saving token: {e}") + finally: + save_token_queue.task_done() + +save_token_queue = queue.Queue() +saver = threading.Thread(target=save_token, daemon=True) +saver.start() + +class InfinityTrainer(object): + def __init__( + self, + device, + raw_scale_schedule: Tuple[int, ...], + vae_local, + gpt_wo_ddp: Infinity, gpt: DDP, + gpt_opt: AmpOptimizer, + label_smooth: float, + zero=0, + vae_type=True, + reweight_loss_by_scale=0, + gpt_wo_ddp_ema=None, + gpt_ema=None, + use_fsdp_model_ema=False, + other_args=None, + ): + super(InfinityTrainer, self).__init__() + + self.zero = zero + self.vae_type = vae_type + + self.gpt: Union[DDP, FSDP, nn.Module] + self.gpt, self.vae_local = gpt, vae_local + self.dynamic_scale_schedule = other_args.dynamic_scale_schedule + self.steps_per_frame = other_args.steps_per_frame + self.dynamic_resolution_h_w, self.h_div_w_templates = get_dynamic_resolution_meta(other_args.dynamic_scale_schedule, other_args.video_frames) + self.gpt_opt: AmpOptimizer = gpt_opt + self.gpt_wo_ddp: Union[Infinity, torch._dynamo.eval_frame.OptimizedModule] = gpt_wo_ddp # after torch.compile + self.gpt_wo_ddp_ema = gpt_wo_ddp_ema + self.gpt_ema = gpt_ema + self.self_correction = SelfCorrection(self.vae_local, other_args) + self.use_fsdp_model_ema = use_fsdp_model_ema + self.batch_size, self.seq_len = 0, 0 + self.reweight_loss_by_scale = reweight_loss_by_scale + print(f'self.reweight_loss_by_scale: 
{self.reweight_loss_by_scale}') + video_encode, _, _, _ = get_encode_decode_func(other_args.dynamic_scale_schedule) + self.video_encode = video_encode + + gpt_uncompiled = self.gpt_wo_ddp._orig_mod if hasattr(self.gpt_wo_ddp, '_orig_mod') else self.gpt_wo_ddp + del gpt_uncompiled.rng + gpt_uncompiled.rng = torch.Generator(device=device) + del gpt_uncompiled + + self.label_smooth = label_smooth + + self.train_loss = nn.CrossEntropyLoss(label_smoothing=label_smooth, reduction='none') + self.val_loss = nn.CrossEntropyLoss(label_smoothing=0.0, reduction='none') + self.loss_weight = {0:{}, 1:{}} + + self.prog_it = 0 + self.last_prog_si = -1 + self.first_prog = True + self.generator = np.random.default_rng(0) + + def train_step( + self, epoch: int, it: int, g_it: int, stepping: bool, clip_decay_ratio: float, metric_lg: misc.MetricLogger, + raw_features_bcthw: FTen, feature_cache_files4images: list, media: str, + inp_B3HW: FTen, text_cond_tuple: Union[ITen, FTen], args: arg_util.Args, + ) -> Tuple[torch.Tensor, Optional[float]]: + device = args.device + B = len(inp_B3HW) + len(raw_features_bcthw) + + if media == 'images': + is_image_batch = 1 + else: + is_image_batch = 0 + # [forward] + with self.gpt_opt.amp_ctx: + with torch.amp.autocast('cuda', enabled=False): + raw_features_list = [] + if len(inp_B3HW): + with torch.no_grad(): + for inp_ind, inp in enumerate(inp_B3HW): + raw_features_, _, _ = self.vae_local.encode_for_raw_features(inp.unsqueeze(0), scale_schedule=None, slice=args.use_slice) + raw_features_list.append(raw_features_) + if args.use_vae_token_cache and args.save_vae_token_cache and (not osp.exists(feature_cache_files4images[inp_ind])): + os.makedirs(osp.dirname(feature_cache_files4images[inp_ind]), exist_ok=True) + save_token_queue.put((raw_features_.cpu().data, [feature_cache_files4images[inp_ind]])) + if len(raw_features_bcthw): + raw_features_bcthw = [item.unsqueeze(0) for item in raw_features_bcthw] + raw_features_list = raw_features_list + raw_features_bcthw + + full_pts_this_batch = [item.shape[-3] for item in raw_features_list] + kv_compact, lens, cu_seqlens_k, max_seqlen_k = text_cond_tuple + x_BLC, x_BLC_mask, gt_BLC, pred_all_bit_indices, visual_rope_cache, sequece_packing_scales, super_scale_lengths, super_querysid_super_refsid, other_info_by_scale = self.video_encode( + vae=self.vae_local, + inp_B3HW=None, + vae_features=raw_features_list, + self_correction=self.self_correction, + args=args, + device=device, + rope2d_freqs_grid=self.gpt.rope2d_freqs_grid, + dynamic_resolution_h_w=self.dynamic_resolution_h_w, + text_lens=lens, + tokens_remain=args.train_max_token_len, + ) + + loss, acc_bit, valid_sequence_ratio = self.gpt( + text_cond_tuple, + x_BLC, + gt_BL=gt_BLC, + is_image_batch=is_image_batch, + visual_rope_cache=visual_rope_cache, + sequece_packing_scales=sequece_packing_scales, + super_scale_lengths=super_scale_lengths, + super_querysid_super_refsid=super_querysid_super_refsid, + other_info_by_scale=other_info_by_scale, + ) # loss & acc_bit: [seq_len] + + # [loss reweight] + # import pdb; pdb.set_trace() + acc_pt2scale_acc = {} + acc_pt2scale_acc_counter = {} + for full_pt, scale_schedule in self.dynamic_resolution_h_w[self.h_div_w_templates[0]][args.pn]['pt2scale_schedule'].items(): + acc_pt2scale_acc[full_pt] = [[] for _ in range(len(scale_schedule))] + acc_pt2scale_acc_counter[full_pt] = [0 for _ in range(len(scale_schedule))] + + flatten_L_list, flatten_acc_bit_list, flatten_weight_list = [], [], [] + ptr = 0 + global_scale_ind = 0 + for sample_ind, item in 
enumerate(sequece_packing_scales): + full_pt = full_pts_this_batch[sample_ind] + for si, (pt, ph, pw) in enumerate(item): + mul_pt_ph_pw = pt * ph * pw + start, end = ptr, ptr+mul_pt_ph_pw + ptr = end + if x_BLC_mask is None: + loss_this_scale = loss[start:end].mean() + acc_this_scale = acc_bit[start:end].mean() + else: + pred_elem_num = x_BLC_mask[start:end].sum() + assert pred_elem_num > 0 + loss_this_scale = loss[start:end].sum() / pred_elem_num + acc_this_scale = acc_bit[start:end].sum() / pred_elem_num + real_si = other_info_by_scale[global_scale_ind]['real_si'] + volume_times = np.array(other_info_by_scale[global_scale_ind]['largest_scale']).prod() / mul_pt_ph_pw + acc_pt2scale_acc[full_pt][real_si].append(acc_this_scale) + acc_pt2scale_acc_counter[full_pt][real_si] += 1 + if self.reweight_loss_by_scale == 0: + weight = 1 * mul_pt_ph_pw + else: + reweight_value = min(args.max_reweight_value, np.power(volume_times, 1/(1+self.reweight_loss_by_scale))) + weight = reweight_value * mul_pt_ph_pw + flatten_weight_list.append(weight) + flatten_L_list.append(loss_this_scale) + flatten_acc_bit_list.append(acc_this_scale) + global_scale_ind += 1 + flatten_weight_list = torch.tensor(flatten_weight_list, dtype=loss.dtype, device=loss.device) + flatten_weight_list = flatten_weight_list / flatten_weight_list.sum() + final_loss = (torch.stack(flatten_L_list) * flatten_weight_list).sum() + final_acc_bit = (torch.stack(flatten_acc_bit_list) * flatten_weight_list).sum() + + # [backward] + grad_norm_t, scale_log2_t = self.gpt_opt.backward_clip_step(ep=epoch, it=it, g_it=g_it, stepping=stepping, loss=final_loss, clip_decay_ratio=clip_decay_ratio) + + # update ema + if args.use_fsdp_model_ema and (args.model_ema_decay < 1): + update_ema(self.gpt_ema, self.gpt) + + # [zero_grad] + if stepping: + self.gpt_opt.optimizer.zero_grad(set_to_none=True) + + # [metric logging] + if metric_lg.log_every_iter or it == 0 or it in metric_lg.log_iters: + def sum_dict(acc_pt2scale_acc): + for full_pt in acc_pt2scale_acc: + for si in range(len(acc_pt2scale_acc[full_pt])): + acc_pt2scale_acc[full_pt][si] = torch.tensor(acc_pt2scale_acc[full_pt][si]).sum() + return acc_pt2scale_acc + + def dict2list(acc_pt2scale_acc): + flatten_acc_pt2scale_acc = [] + for key, val in acc_pt2scale_acc.items(): + flatten_acc_pt2scale_acc.extend(val) + return flatten_acc_pt2scale_acc + + def list2dict(acc_pt2scale_acc, flatten_acc_pt2scale_acc): + ptr = 0 + for key in acc_pt2scale_acc: + for ind in range(len(acc_pt2scale_acc[key])): + acc_pt2scale_acc[key][ind] = flatten_acc_pt2scale_acc[ptr] + ptr += 1 + return acc_pt2scale_acc + + acc_pt2scale_acc = sum_dict(acc_pt2scale_acc) + flatten_acc_pt2scale_acc = dict2list(acc_pt2scale_acc) + flatten_acc_pt2scale_acc_counter = dict2list(acc_pt2scale_acc_counter) + + train_loss = final_loss.item() + train_acc = final_acc_bit.item() + metrics = torch.tensor(flatten_acc_pt2scale_acc + flatten_acc_pt2scale_acc_counter + [grad_norm_t.item(), train_loss, train_acc, is_image_batch, valid_sequence_ratio], device=loss.device) + tdist.all_reduce(metrics, op=tdist.ReduceOp.SUM) + flatten_acc_pt2scale_acc, flatten_acc_pt2scale_acc_counter = metrics[:len(flatten_acc_pt2scale_acc)], metrics[len(flatten_acc_pt2scale_acc):2*len(flatten_acc_pt2scale_acc)] + flatten_acc_pt2scale_acc = flatten_acc_pt2scale_acc / (flatten_acc_pt2scale_acc_counter + 1e-16) + acc_pt2scale_acc = list2dict(acc_pt2scale_acc, flatten_acc_pt2scale_acc) + acc_pt2scale_acc_counter = list2dict(acc_pt2scale_acc_counter, 
flatten_acc_pt2scale_acc_counter) + grad_norm_t, train_loss, train_acc, is_image_batch, valid_sequence_ratio = metrics[2*len(flatten_acc_pt2scale_acc):] / (dist.get_world_size() + 1e-16) + if args.num_of_label_value == 1: + key, base = 'Loss', 1 + else: + key, base = 'Acc', 100 + metric_lg.update(L=train_loss, Acc=train_acc*base, L_i=0., Acc_i=0., L_v=0., Acc_v=0., tnm=grad_norm_t, seq_usage=valid_sequence_ratio*100.) # todo: Accm, Acct + wandb_log_dict = { + 'Overall/train_loss': train_loss, + 'Overall/train_acc': train_acc*base, + 'Overall/grad_norm_t': grad_norm_t, + 'Overall/video_batch_ratio': (1-is_image_batch)*100., + 'Overall/valid_sequence_ratio': valid_sequence_ratio*100., + } + for full_pt in acc_pt2scale_acc: + for si in range(len(acc_pt2scale_acc[full_pt])): + if acc_pt2scale_acc_counter[full_pt][si] > 0: + duration = (full_pt-1) / args.temporal_compress_rate + wandb_log_dict[f'Details/{key}/t{duration:04.1f}s/s{si+1:03d}'] = acc_pt2scale_acc[full_pt][si].item() * base + wandb_log_dict[f'Details/Num/t{duration:04.1f}s/s{si+1:03d}'] = acc_pt2scale_acc_counter[full_pt][si] + wandb_utils.log(wandb_log_dict, step=g_it) + return grad_norm_t, scale_log2_t + + def __repr__(self): + return ( + f'\n' + f'[VGPTTr.config]: {pformat(self.get_config(), indent=2, width=250)}\n' + f'[VGPTTr.structure]: {super(InfinityTrainer, self).__repr__().replace(InfinityTrainer.__name__, "")}' + ) + + def ema_load(self): + self.cached_state_not_ema = {k: v.cpu() for k, v in self.gpt_wo_ddp.state_dict().items()} + for pi, p_ema in self.pi_para_copy_for_parallel_ema: + self.gpt_opt.paras[pi].data.copy_(p_ema) + for pi, para in enumerate(self.gpt_opt.paras): + dist.broadcast(para, src_rank=pi % dist.get_world_size()) + + def ema_recover(self): + self.gpt_wo_ddp.load_state_dict(self.cached_state_not_ema) + del self.cached_state_not_ema + self.cached_state_not_ema = None + + def get_config(self): + return { + 'label_smooth': self.label_smooth, + 'prog_it': self.prog_it, 'last_prog_si': self.last_prog_si, 'first_prog': self.first_prog, + } + + def state_dict(self): + m = self.vae_local + if hasattr(m, '_orig_mod'): + m = m._orig_mod + state = {'config': self.get_config(), 'vae_local': m.state_dict()} + + if self.zero: # TODO: fixme + state['gpt_fsdp'] = None + with FSDP.state_dict_type(self.gpt, StateDictType.FULL_STATE_DICT, fullstate_save_policy, fulloptstate_save_policy): + state['gpt_fsdp'] = self.gpt.state_dict() + if self.use_fsdp_model_ema: + state['gpt_ema_fsdp'] = self.gpt_ema.state_dict() + state['gpt_fsdp_opt'] = FSDP.optim_state_dict(model=self.gpt, optim=self.gpt_opt.optimizer, optim_state_dict=self.gpt_opt.optimizer.state_dict()) + if self.gpt_opt.scaler is not None: + state['gpt_opt_scaler'] = self.gpt_opt.scaler.state_dict() + + else: + + for k in ('gpt_wo_ddp', 'gpt_opt'): + m = getattr(self, k) + if m is not None: + if hasattr(m, '_orig_mod'): + m = m._orig_mod + state[k] = m.state_dict() + return state + + def load_state_dict(self, state, strict=True, skip_vae=False): + if self.zero: + with FSDP.state_dict_type(self.gpt, StateDictType.FULL_STATE_DICT, fullstate_save_policy, fulloptstate_save_policy): + self.gpt.load_state_dict(state['gpt_fsdp']) + if self.use_fsdp_model_ema: + self.gpt_ema.load_state_dict(state['gpt_ema_fsdp']) + one_group_opt_state = state['gpt_fsdp_opt'] + """ + AdamW state['gpt_fsdp_opt']: + { + 'state': { : {'exp_avg': , 'exp_avg_sq': , 'step': } }, + 'param_groups': [ + { + 'wd_sc': 1.0, 'lr_sc': 1.0, 'lr': xxx, 'betas': (0.9, 0.97), 'eps': 1e-08, 'weight_decay': 0.02, 
+ 'amsgrad': False, 'foreach': None, 'maximize': False, 'capturable': False, 'differentiable': False, 'fused': True, + 'params': [ x m] + } x n + ] + } + one_group_opt_state['param_groups'] = self.gpt_opt.optimizer.state_dict()['param_groups'] + """ + optim_state_dict = FSDP.optim_state_dict_to_load(model=self.gpt, optim=self.gpt_opt.optimizer, optim_state_dict=one_group_opt_state) + self.gpt_opt.optimizer.load_state_dict(optim_state_dict) + + if self.gpt_opt.scaler is not None: + try: self.gpt_opt.scaler.load_state_dict(state['gpt_opt_scaler']) + except Exception as e: print(f'[fp16 load_state_dict err] {e}') + else: + for k in ('gpt_wo_ddp', 'gpt_opt'): + if skip_vae and 'vae' in k: continue + m = getattr(self, k) + if m is not None: + if hasattr(m, '_orig_mod'): + m = m._orig_mod + ret = m.load_state_dict(state[k], strict=strict) + if ret is not None: + missing, unexpected = ret + print(f'[VGPTTr.load_state_dict] {k} missing: {missing}') + print(f'[VGPTTr.load_state_dict] {k} unexpected: {unexpected}') + + config: dict = state.pop('config', None) + self.prog_it = config.get('prog_it', 0) + self.last_prog_si = config.get('last_prog_si', -1) + self.first_prog = config.get('first_prog', True) + if config is not None: + for k, v in self.get_config().items(): + if config.get(k, None) != v: + err = f'[VGPT.load_state_dict] config mismatch: this.{k}={v} (ckpt.{k}={config.get(k, None)})' + if strict: + raise AttributeError(err) + else: + print(err) diff --git a/Meissonic/InfinityStar/infinity/utils/__pycache__/arg_util.cpython-310.pyc b/Meissonic/InfinityStar/infinity/utils/__pycache__/arg_util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91f73f6fd34d95fb5c9d6a8836561ba10500e30f Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/__pycache__/arg_util.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/__pycache__/dist.cpython-310.pyc b/Meissonic/InfinityStar/infinity/utils/__pycache__/dist.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..825fefb2a74aa145a89894bfa08271eaa6312448 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/__pycache__/dist.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/__pycache__/load.cpython-310.pyc b/Meissonic/InfinityStar/infinity/utils/__pycache__/load.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a79eabf708070cb8b20e3bfcf5f77833f4d82f8 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/__pycache__/load.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/__pycache__/save_and_load.cpython-310.pyc b/Meissonic/InfinityStar/infinity/utils/__pycache__/save_and_load.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e59569572c4d9f71c25665a145048ca230ec7490 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/__pycache__/save_and_load.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/__pycache__/sequence_parallel.cpython-310.pyc b/Meissonic/InfinityStar/infinity/utils/__pycache__/sequence_parallel.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14009502c8e66dff1edd3554f99c29570595e05e Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/__pycache__/sequence_parallel.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/__pycache__/video_decoder.cpython-310.pyc 
b/Meissonic/InfinityStar/infinity/utils/__pycache__/video_decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e60a14b39347a7a278ab998aa26f929186edbdd Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/__pycache__/video_decoder.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/amp_opt.py b/Meissonic/InfinityStar/infinity/utils/amp_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..3047258632b8f28744905efafff3695d21347a9d --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/amp_opt.py @@ -0,0 +1,155 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import math +import os +import signal +import sys +import time +from typing import List, Optional, Tuple, Union + +import torch +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +# from memory_profiler import profile + +import infinity.utils.dist as dist + +class NullCtx: + def __enter__(self): + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + +class AmpOptimizer: + def __init__( + self, + model_name_3letters: str, mixed_precision: int, + optimizer: torch.optim.Optimizer, model_maybe_fsdp: Union[torch.nn.Module, FSDP], + r_accu: float, grad_clip: float, zero: int, + ): + self.enable_amp = mixed_precision > 0 + self.zero = zero + if self.enable_amp: + self.using_fp16_rather_bf16 = mixed_precision != 2 + self.max_sc = float(mixed_precision if mixed_precision > 128 else 32768) + + self.amp_ctx = torch.autocast('cuda', enabled=True, dtype=torch.float16 if self.using_fp16_rather_bf16 else torch.bfloat16, cache_enabled=self.zero == 0) # todo: cache_enabled=False + if self.using_fp16_rather_bf16: + self.scaler = torch.cuda.amp.GradScaler(init_scale=2. ** 11, growth_interval=1000) + else: + self.scaler = None + else: + self.using_fp16_rather_bf16 = True + self.amp_ctx = NullCtx() + self.scaler = None + + t = torch.zeros(dist.get_world_size()) + t[dist.get_rank()] = float(self.enable_amp) + dist.allreduce(t) + assert round(t.sum().item()) in {0, dist.get_world_size()}, f'enable_amp: {t}' + + t = torch.zeros(dist.get_world_size()) + t[dist.get_rank()] = float(self.using_fp16_rather_bf16) + dist.allreduce(t) + assert round(t.sum().item()) in {0, dist.get_world_size()}, f'using_fp16_rather_bf16: {t}' + + self.model_name_3letters = model_name_3letters + self.optimizer, self.model_maybe_fsdp = optimizer, model_maybe_fsdp + self.r_accu = r_accu + + self.paras = self.names = ... 
# todo: solve EMA-related codes + + self.grad_clip, self.grad_clip_we = grad_clip, 0 # todo: disable wclip + if self.grad_clip > 100: + self.grad_clip %= 100 + self.per_param = True + else: + self.per_param = False + self.per_param = False # todo: disable wclip + + self.early_clipping = grad_clip > 0 and not hasattr(optimizer, 'global_grad_norm') + self.late_clipping = grad_clip > 0 and hasattr(optimizer, 'global_grad_norm') # deepspeed's optimizer + + self.fp = None + self.last_orig_norm: torch.Tensor = torch.tensor(0.1) + + + # @profile(precision=4, stream=open('amp_sc.log', 'w+')) + def backward_clip_step( + self, ep: int, it: int, g_it: int, stepping: bool, loss: torch.Tensor, clip_decay_ratio=1, stable=False, + ) -> Tuple[torch.Tensor, Optional[float]]: + # backward + loss = loss.mul(self.r_accu) # r_accu == 1.0 / n_gradient_accumulation + orig_norm = scaler_sc = None + # if self.fp is not None: + # if g_it % 20 == 0: self.fp.seek(0); self.fp.truncate(0) + if self.scaler is not None: + self.scaler.scale(loss).backward(retain_graph=False, create_graph=False) # retain_graph=retain_graph, create_graph=create_graph + else: + loss.backward(retain_graph=False, create_graph=False) + # if self.fp is not None: self.fp.write(f'[backward_clip_step:131] [it{it}, g_it{g_it}] after backward\n'); self.fp.flush() + + # clip gradients then step optimizer + if stepping: + if self.scaler is not None: self.scaler.unscale_(self.optimizer) # now the gradient can be correctly got + # if self.fp is not None: self.fp.write(f'[backward_clip_step:137] [it{it}, g_it{g_it}] after scaler.unscale_\n'); self.fp.flush() + + skipped, orig_norm = 0, self.last_orig_norm + # try: + if self.fp is not None: + if g_it % 10 == 0: self.fp.seek(0); self.fp.truncate(0) + self.fp.write(f'\n'); self.fp.flush() + if self.early_clipping: + c = self.grad_clip * clip_decay_ratio + if self.zero: + orig_norm: Optional[torch.Tensor] = self.model_maybe_fsdp.clip_grad_norm_(c) + else: + orig_norm: Optional[torch.Tensor] = torch.nn.utils.clip_grad_norm_(self.model_maybe_fsdp.parameters(), c) + + # if self.fp is not None: self.fp.write(f'[backward_clip_step:175] [it{it}, g_it{g_it}] before opt step\n'); self.fp.flush() + if self.scaler is not None: + self.scaler: torch.cuda.amp.GradScaler + if self.zero: + # synchronize found_inf_per_device before calling step, so that even if only some ranks found inf on their sharded params, all other ranks will know + # otherwise, when saving FSDP optimizer state, it will cause AssertionError saying "Different ranks have different values for step." + for optimizer_state in self.scaler._per_optimizer_states.values(): + for t in optimizer_state['found_inf_per_device'].values(): + dist.allreduce(t) # ideally, each rank only has one single t; so no need to use async allreduce + + self.scaler.step(self.optimizer) + scaler_sc: Optional[float] = self.scaler.get_scale() + if scaler_sc > self.max_sc: # fp16 will overflow when >65536, so multiply 32768 could be dangerous + # print(f'[fp16 scaling] too large loss scale {scaler_sc}! 
(clip to {self.max_sc:g})') + self.scaler.update(new_scale=self.max_sc) + else: + self.scaler.update() + try: + scaler_sc = float(math.log2(scaler_sc)) + except Exception as e: + print(f'[scaler_sc = {scaler_sc}]\n' * 15, flush=True) + time.sleep(1) + print(f'[scaler_sc = {scaler_sc}]\n' * 15, flush=True) + raise e + else: + self.optimizer.step() + + if self.late_clipping: + orig_norm: Optional[torch.Tensor] = self.optimizer.global_grad_norm + self.last_orig_norm = orig_norm + # no zero_grad calling here, gonna log those gradients! + return orig_norm, scaler_sc + + def state_dict(self): + return { + 'optimizer': self.optimizer.state_dict() + } if self.scaler is None else { + 'scaler': self.scaler.state_dict(), + 'optimizer': self.optimizer.state_dict() + } + + def load_state_dict(self, state, strict=True): + if self.scaler is not None: + try: self.scaler.load_state_dict(state['scaler']) + except Exception as e: print(f'[fp16 load_state_dict err] {e}') + self.optimizer.load_state_dict(state['optimizer']) diff --git a/Meissonic/InfinityStar/infinity/utils/arg_util.py b/Meissonic/InfinityStar/infinity/utils/arg_util.py new file mode 100644 index 0000000000000000000000000000000000000000..067a7bc5324b0fa1e12e10cce9786879b5bb0159 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/arg_util.py @@ -0,0 +1,385 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import json +import os +import random +import sys +import time +from collections import OrderedDict +from typing import Union + +import numpy as np +import torch +from tap import Tap + +import infinity.utils.dist as dist +from infinity.utils.sequence_parallel import SequenceParallelManager as sp_manager + + +class Args(Tap): + # ================================================================================================================== + # ============================================= Paths and Directories ============================================ + # ================================================================================================================== + local_out_path: str = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'local_output') # Directory to save checkpoints + data_path: str = '' # Path to the image dataset + video_data_path: str = '' # Path to the video dataset + bed: str = '' # Directory to copy checkpoints apart from local_out_path + vae_path: str = '' # Path to the VAE checkpoint + log_txt_path: str = '' # Path to the log file + t5_path: str = '' # Path to the T5 model; if not specified, it will be automatically found + token_cache_dir: str = '' # Directory for token cache + + # ================================================================================================================== + # =============================================== General Training ================================================= + # ================================================================================================================== + exp_name: str = '' # Experiment name + project_name: str = 'infinitystar' # Name of the wandb project + tf32: bool = True # Whether to use TensorFloat32 + auto_resume: bool = True # Whether to automatically resume from the last checkpoint + rush_resume: str = '' # Path to a pretrained infinity checkpoint for rushing resume + rush_omnistore_resume: str = '' # Path to an omnistore pretrained checkpoint for rushing resume + torchshard_resume: str = '' # Path to an torch shard checkpoint resume + log_every_iter: bool = False # Whether to log every 
iteration + checkpoint_type: str = 'torch' # Type of checkpoint: 'torch' or 'omnistore' + device: str = 'cpu' # Device to use for training ('cpu' or 'cuda') + is_master_node: bool = None # Whether the current node is the master node + epoch: int = 300 # Number of training epochs + log_freq: int = 1 # Logging frequency in stdout + save_model_iters_freq: int = 1000 # Frequency of saving the model in iterations + short_cap_prob: float = 0.2 # Probability of training with short captions + label_smooth: float = 0.0 # Label smoothing factor + cfg: float = 0.1 # Classifier-free guidance dropout probability + rand_uncond: bool = False # Whether to use random, unlearnable unconditional embedding + twoclip_alternatingtraining: int = 0 # Whether to use two-clip alternating training + wp_it: int = 100 # Warm-up iterations + + # ================================================================================================================== + # ===================================================== Model ====================================================== + # ================================================================================================================== + model: str = '' # Model type: 'b' for VAE training, or any other for GPT training + sdpa_mem: bool = True # Whether to use memory-efficient SDPA + rms_norm: bool = False # Whether to use RMS normalization + tau: float = 1 # Tau of self-attention in GPT + tini: float = -1 # Initialization parameters + topp: float = 0.0 # top-p + topk: float = 0.0 # top-k + fused_norm: bool = False # Whether to use fused normalization + flash: bool = False # Whether to use customized flash-attention kernel + use_flex_attn: bool = False # Whether to use flex_attn to speed up training + norm_eps: float = 1e-6 # Epsilon for normalization layers + Ct5: int = 2048 # Feature dimension of the text encoder + simple_text_proj: int = 1 # Whether to use a simple text projection + mask_type: str = 'infinity_elegant_clip20frames_v2' # Self-attention mask type ('var' or 'video_tower') + mask_video_first_frame: int = 0 # Whether to mask the first frame of the video when calculating loss + + use_fsdp_model_ema: int = 0 # Whether to use FSDP model EMA + model_ema_decay: float = 0.9999 # Model EMA decay rate + + rope_type: str = '4d' # RoPE type ('2d', '3d', or '4d') + rope2d_each_sa_layer: int = 1 # Apply RoPE2D to each self-attention layer + rope2d_normalized_by_hw: int = 2 # Apply normalized RoPE2D + add_lvl_embeding_on_first_block: int = 0 # Apply level PE embedding only to the first block + + # ================================================================================================================== + # ================================================== Scale Schedule ============================================= + # ================================================================================================================== + semantic_scales: int = 8 # Number of semantic scales + semantic_scale_dim: int = 16 # Dimension of semantic scales + detail_scale_dim: int = 64 # Dimension of detail scales + use_learnable_dim_proj: int = 0 # Whether to use a learnable dimension projection + detail_scale_min_tokens: int = 80 # Minimum number of tokens for detail scale + pn: str = '' # Pixel numbers, choose from '0.06M', '0.25M', '1M' + scale_schedule: tuple = None # [Automatically set] Scale schedule based on pn + patch_size: int = None # [Automatically set] Patch size based on scale_schedule + dynamic_scale_schedule: str = '' # Dynamic scale schedule for 
video + min_scale_ind: int = 3 # Minimum scale index for infinity frame pack + max_reweight_value: int = 40 # Clipping value for reweighting + image_scale_repetition: str = '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]' # Repetition for image scales + video_scale_repetition: str = '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]' # Repetition for video scales + inner_scale_boost: int = 0 # Whether to boost inner scales + drop_720p_last_scale: int = 1 # Whether to drop the last scale for 720p + reweight_loss_by_scale: int = 0 # Reweight loss by scale + + # ================================================================================================================== + # ================================================== Optimization ================================================== + # ================================================================================================================== + tlr: float = 2e-5 # Learning rate + grad_clip: float = 5 # Gradient clipping threshold + cdec: bool = False # Whether to decay the grad clip thresholds + opt: str = 'adamw' # Optimizer type ('adamw' or 'lion') + ada: str = '0.9_0.97' # Adam's beta parameters (e.g., '0.9_0.999') + adam_eps: float = 0.0 # Adam's epsilon + fused_adam: bool = True # Whether to use fused Adam optimizer + disable_weight_decay: int = 1 # Whether to disable weight decay on sparse params + fp16: int = 2 # Floating point precision: 1 for fp16, 2 for bf16 + + # ================================================================================================================== + # ====================================================== Data ====================================================== + # ================================================================================================================== + video_fps: int = 16 # Frames per second for video + video_frames: int = 81 # Number of frames per video + video_batch_size: int = 1 # Batch size for video data + workers: int = 16 # Number of dataloader workers + image_batch_size: int = 0 # [Automatically set] Batch size per GPU for image data + ac: int = 1 # Gradient accumulation steps + r_accu: float = 1.0 # [Automatically set] Reciprocal of gradient accumulation + tlen: int = 512 # Truncate text embedding to this length + num_of_label_value: int = 2 # Number of label values (2 for bitwise, 0 for index-wise) + dynamic_resolution_across_gpus: int = 1 # Allow dynamic resolution across GPUs + enable_dynamic_length_prompt: int = 0 # Enable dynamic length prompt during training + use_streaming_dataset: int = 0 # Whether to use a streaming dataset + iterable_data_buffersize: int = 90000 # Buffer size for streaming dataset + image_batches_multiply: float = 1.0 # Multiplier for the number of image batches per epoch + down_size_limit: int = 10000 # Download size limit for videos in MB + addition_pn_list: str = '[]' # Additional pixel number list + video_caption_type: str = 'tarsier2_caption' # Type of video caption to use + only_images4extract_feats: int = 0 # Whether to only extract features for images + train_max_token_len: int = -1 # Maximum token length for training + train_with_var_seq_len: int = 0 # Whether to train with variable sequence length + video_var_len_prob: str = '[30, 30, 30, 5, 3, 2]' # Probability distribution for variable video length + duration_resolution: int = 1 # Resolution for duration + seq_pack_bucket: int = 1000 # Bucket size for sequence packing + drop_long_video: int = 0 # Whether to drop long videos + min_video_frames: int = -1 # Minimum number of video frames + 
restrict_data_size: int = -1 # Restrict the size of the dataset + allow_less_one_elem_in_seq: int = 0 # Allow sequences with less than one element + train_192pshort: int = 0 # Whether to train with 192p short videos + steps_per_frame: int = 3 # Steps per frame for the video tower + add_motion_score2caption: int = 0 # Whether to prepend motion score to the caption + context_frames: int = 10000 # Context frames for the video tower + cached_video_frames: int = 81 # Number of cached video frames + frames_inner_clip: int = 20 # Number of frames in a clip for infinity frame pack + context_interval: int = 2 # Context interval + context_from_largest_no: int = 1 # Context from the largest number + append_duration2caption: int = 0 # Whether to append duration to the caption + cache_check_mode: int = 0 # Cache check mode + online_t5: bool = True # Whether to use online T5 or load local features + + # ================================================================================================================== + # ============================================= Distributed Training =============================================== + # ================================================================================================================== + enable_hybrid_shard: bool = False # Whether to use hybrid FSDP + inner_shard_degree: int = 8 # Inner degree for FSDP + zero: int = 0 # DeepSpeed ZeRO stage + buck: str = 'chunk' # Module-wise bucketing for FSDP + fsdp_orig: bool = True # Whether to use original FSDP + enable_checkpointing: str = None # Checkpointing strategy: 'full-block', 'self-attn' + pad_to_multiplier: int = 128 # Pad sequence length to a multiplier of this value + sp_size: int = 0 # Sequence parallelism size + fsdp_save_flatten_model: int = 1 # Whether to save the flattened model in FSDP + inject_sync: int = 0 # Whether to inject synchronization + model_init_device: str = 'cuda' # Device for model initialization + fsdp_init_device: str = 'cuda' # Device for FSDP initialization + + # ================================================================================================================== + # ======================================================= VAE ====================================================== + # ================================================================================================================== + vae_type: int = 64 # VAE type (e.g., 16/32/64 for bsq vae quant bits) + fake_vae_input: bool = False # Whether to use fake VAE input for debugging + use_slice: int = 1 # Whether to use slicing for VAE encoding + use_vae_token_cache: int = 1 # Whether to use token cache for VAE + save_vae_token_cache: int = 0 # Whether to save the VAE token cache + allow_online_vae_feature_extraction: int = 1 # Allow online VAE feature extraction + use_text_token_cache: int = 0 # Whether to use text token cache + videovae: int = 10 # Whether to use a video VAE + use_feat_proj: int = 2 # Whether to use feature projection + use_two_stage_lfq: int = 0 # Whether to use two-stage LFQ + casual_multi_scale: int = 0 # Whether to use causal multi-scale + temporal_compress_rate: int = 4 # Temporal compression rate + apply_spatial_patchify: int = 0 # Whether to apply spatial patchify + + + # ================================================================================================================== + # ============================================ Bitwise Self-Correction ============================================= + # 
================================================================================================================== + noise_apply_layers: int = 1000 # Apply noise to layers + noise_apply_strength: str = '-1' # Noise strength + noise_apply_requant: int = 1 # Requant after applying noise + noise_apply_random_one: int = 0 # Requant only one scale randomly + debug_bsc: int = 0 # Save figures and set breakpoints for debugging BSC + noise_input: int = 0 # Whether to add noise to the input + reduce_accumulate_error_method: str = 'bsc' # Method to reduce accumulation error + + + + ############################ Attention! The following arguments and configurations are set automatically, you can skip reading the following part ############################### + ############################ Attention! The following arguments and configurations are set automatically, you can skip reading the following part ############################### + ############################ Attention! The following arguments and configurations are set automatically, you can skip reading the following part ############################### + + + # would be automatically set in runtime + branch: str = '' # subprocess.check_output(f'git symbolic-ref --short HEAD 2>/dev/null || git rev-parse HEAD', shell=True).decode('utf-8').strip() or '[unknown]' # [automatically set; don't specify this] + commit_id: str = '' # subprocess.check_output(f'git rev-parse HEAD', shell=True).decode('utf-8').strip() or '[unknown]' # [automatically set; don't specify this] + commit_msg: str = ''# (subprocess.check_output(f'git log -1', shell=True).decode('utf-8').strip().splitlines() or ['[unknown]'])[-1].strip() # [automatically set; don't specify this] + cmd: str = ' '.join(a.replace('--exp_name=', '').replace('--exp_name ', '') for a in sys.argv[7:]) # [automatically set; don't specify this] + tag: str = 'UK' # [automatically set; don't specify this] + cur_it: str = '' # [automatically set; don't specify this] + MFU: float = None # [automatically set; don't specify this] + HFU: float = None # [automatically set; don't specify this] + # ================================================================================================================== + # ======================== ignore these parts below since they are only for debug use ============================== + # ================================================================================================================== + + dbg: bool = 'KEVIN_LOCAL' in os.environ # only used when debug about unused param in DDP + prof: int = 0 # profile + prof_freq: int = 50 # profile + profall: int = 0 + # ================================================================================================================== + # ======================== ignore these parts above since they are only for debug use ============================== + # ================================================================================================================== + + @property + def gpt_training(self): + return len(self.model) > 0 + + def set_initial_seed(self, benchmark: bool): + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = benchmark + assert self.seed + seed = self.seed + torch.backends.cudnn.deterministic = True + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + def dump_log(self): + if not dist.is_local_master(): + return + nd 
= {'is_master': dist.is_visualizer()} + for k, v in { + 'name': self.exp_name, + 'tag': self.tag, + 'cmd': self.cmd, + 'commit': self.commit_id, + 'branch': self.branch, + 'cur_it': self.cur_it, + 'last_upd': time.strftime("%Y-%m-%d %H:%M", time.localtime()), + 'opt': self.opt, + 'is_master_node': self.is_master_node, + }.items(): + if hasattr(v, 'item'):v = v.item() + if v is None or (isinstance(v, str) and len(v) == 0): continue + nd[k] = v + + with open(self.log_txt_path, 'w') as fp: + json.dump(nd, fp, indent=2) + + def state_dict(self, key_ordered=True) -> Union[OrderedDict, dict]: + d = (OrderedDict if key_ordered else dict)() + for k in self.class_variables.keys(): + if k not in {'device', 'dbg_ks_fp'}: # these are not serializable + d[k] = getattr(self, k) + return d + + def load_state_dict(self, d: Union[OrderedDict, dict, str]): + if isinstance(d, str): # for compatibility with old version + d: dict = eval('\n'.join([l for l in d.splitlines() if ' 1: + print(f"INFO: sp_size={args.sp_size}") + sp_manager.init_sp(args.sp_size) + + + args.r_accu = 1 / args.ac # gradient accumulation + args.ada = args.ada or ('0.9_0.96' if args.gpt_training else '0.5_0.9') + args.opt = args.opt.lower().strip() + + # gpt args + if args.gpt_training: + assert args.vae_path, 'VAE ckpt must be specified when training GPT' + from infinity.models import alias_dict + if args.model in alias_dict: + args.model = alias_dict[args.model] + + args.log_txt_path = os.path.join(args.local_out_path, 'log.txt') + + args.enable_checkpointing = None if args.enable_checkpointing in [False, 0, "0"] else args.enable_checkpointing + args.enable_checkpointing = "full-block" if args.enable_checkpointing in [True, 1, "1"] else args.enable_checkpointing + assert args.enable_checkpointing in [None, "full-block", "full-attn", "self-attn"], \ + f"only support no-checkpointing or full-block/full-attn checkpointing, but got {args.enable_checkpointing}." 
+ + if len(args.exp_name) == 0: + args.exp_name = os.path.basename(args.bed) or 'test_exp' + + if '-' in args.exp_name: + args.tag, args.exp_name = args.exp_name.split('-', maxsplit=1) + else: + args.tag = 'UK' + + if dist.is_master(): + os.system(f'rm -rf {os.path.join(args.bed, "ready-node*")} {os.path.join(args.local_out_path, "ready-node*")}') + + if args.sdpa_mem: + from torch.backends.cuda import enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_flash_sdp(True) + enable_mem_efficient_sdp(True) + enable_math_sdp(False) + print(args) + if isinstance(args.noise_apply_strength, str): + args.noise_apply_strength = list(map(float, args.noise_apply_strength.split(','))) + elif isinstance(args.noise_apply_strength, float): + args.noise_apply_strength = [args.noise_apply_strength] + return args diff --git a/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/comm.cpython-310.pyc b/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/comm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99d5627026b82e62b933e9fe7241759922045d7a Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/comm.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/operation.cpython-310.pyc b/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/operation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac198266f27291a56d6329bf6ac703313444c124 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/operation.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/pg_utils.cpython-310.pyc b/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/pg_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26c4124d79e0b5b2efed96078d3d02ce4c0032a4 Binary files /dev/null and b/Meissonic/InfinityStar/infinity/utils/comm/__pycache__/pg_utils.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/infinity/utils/comm/comm.py b/Meissonic/InfinityStar/infinity/utils/comm/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..28eb3580eb09b94433964950866adf3703fb2719 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/comm/comm.py @@ -0,0 +1,425 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +from typing import Any, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn.functional as F +from einops import rearrange +from torch import Tensor +from torch.distributed import ProcessGroup + +if torch.__version__ >= "2.4.0": + _torch_custom_op_wrapper = torch.library.custom_op + _torch_register_fake_wrapper = torch.library.register_fake +else: + def noop_custom_op_wrapper(name, fn=None, /, *, mutates_args, device_types=None, schema=None): + def wrap(func): + return func + if fn is None: + return wrap + return fn + def noop_register_fake_wrapper(op, fn=None, /, *, lib=None, _stacklevel=1): + def wrap(func): + return func + if fn is None: + return wrap + return fn + _torch_custom_op_wrapper = noop_custom_op_wrapper + _torch_register_fake_wrapper = noop_register_fake_wrapper + + +__sp_comm_group__ = None + +def set_sp_comm_group(group=None): + global __sp_comm_group__ + assert __sp_comm_group__ is None and group is not None + __sp_comm_group__ = group + +def get_sp_comm_group(): + global __sp_comm_group__ + assert __sp_comm_group__ is not None + return __sp_comm_group__ + + +# 
====================================================== +# Model +# ====================================================== + + +def model_sharding(model: torch.nn.Module): + global_rank = dist.get_rank() + world_size = dist.get_world_size() + for _, param in model.named_parameters(): + padding_size = (world_size - param.numel() % world_size) % world_size + if padding_size > 0: + padding_param = torch.nn.functional.pad(param.data.view(-1), [0, padding_size]) + else: + padding_param = param.data.view(-1) + splited_params = padding_param.split(padding_param.numel() // world_size) + splited_params = splited_params[global_rank] + param.data = splited_params + + +# ====================================================== +# AllGather & ReduceScatter +# ====================================================== + + +class AsyncAllGatherForTwo(torch.autograd.Function): + @staticmethod + def forward( + ctx: Any, + inputs: Tensor, + weight: Tensor, + bias: Tensor, + sp_rank: int, + sp_size: int, + group: Optional[ProcessGroup] = None, + ) -> Tuple[Tensor, Any]: + """ + Returns: + outputs: Tensor + handle: Optional[Work], if overlap is True + """ + from torch.distributed._functional_collectives import all_gather_tensor + + ctx.group = group + ctx.sp_rank = sp_rank + ctx.sp_size = sp_size + + # all gather inputs + all_inputs = all_gather_tensor(inputs.unsqueeze(0), 0, group) + # compute local qkv + local_qkv = F.linear(inputs, weight, bias).unsqueeze(0) + + # remote compute + remote_inputs = all_inputs[1 - sp_rank].view(list(local_qkv.shape[:-1]) + [-1]) + # compute remote qkv + remote_qkv = F.linear(remote_inputs, weight, bias) + + # concat local and remote qkv + if sp_rank == 0: + qkv = torch.cat([local_qkv, remote_qkv], dim=0) + else: + qkv = torch.cat([remote_qkv, local_qkv], dim=0) + qkv = rearrange(qkv, "sp b n c -> b (sp n) c") + + ctx.save_for_backward(inputs, weight, remote_inputs) + return qkv + + @staticmethod + def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]: + from torch.distributed._functional_collectives import reduce_scatter_tensor + + group = ctx.group + sp_rank = ctx.sp_rank + sp_size = ctx.sp_size + inputs, weight, remote_inputs = ctx.saved_tensors + + # split qkv_grad + qkv_grad = grad_outputs[0] + qkv_grad = rearrange(qkv_grad, "b (sp n) c -> sp b n c", sp=sp_size) + qkv_grad = torch.chunk(qkv_grad, 2, dim=0) + if sp_rank == 0: + local_qkv_grad, remote_qkv_grad = qkv_grad + else: + remote_qkv_grad, local_qkv_grad = qkv_grad + + # compute remote grad + remote_inputs_grad = torch.matmul(remote_qkv_grad, weight).squeeze(0) + weight_grad = torch.matmul(remote_qkv_grad.transpose(-1, -2), remote_inputs).squeeze(0).sum(0) + bias_grad = remote_qkv_grad.squeeze(0).sum(0).sum(0) + + # launch async reduce scatter + remote_inputs_grad_zero = torch.zeros_like(remote_inputs_grad) + if sp_rank == 0: + remote_inputs_grad = torch.cat([remote_inputs_grad_zero, remote_inputs_grad], dim=0) + else: + remote_inputs_grad = torch.cat([remote_inputs_grad, remote_inputs_grad_zero], dim=0) + remote_inputs_grad = reduce_scatter_tensor(remote_inputs_grad, "sum", 0, group) + + # compute local grad and wait for reduce scatter + local_input_grad = torch.matmul(local_qkv_grad, weight).squeeze(0) + weight_grad += torch.matmul(local_qkv_grad.transpose(-1, -2), inputs).squeeze(0).sum(0) + bias_grad += local_qkv_grad.squeeze(0).sum(0).sum(0) + + # sum remote and local grad + inputs_grad = remote_inputs_grad + local_input_grad + return inputs_grad, weight_grad, bias_grad, None, None, None + + +class 
AllGather(torch.autograd.Function): + @staticmethod + def forward( + ctx: Any, + inputs: Tensor, + group: Optional[ProcessGroup] = None, + overlap: bool = False, + ) -> Tuple[Tensor, Any]: + """ + Returns: + outputs: Tensor + handle: Optional[Work], if overlap is True + """ + assert ctx is not None or not overlap + + if ctx is not None: + ctx.comm_grp = group + + comm_size = dist.get_world_size(group) + if comm_size == 1: + return inputs.unsqueeze(0), None + + buffer_shape = (comm_size,) + inputs.shape + outputs = torch.empty(buffer_shape, dtype=inputs.dtype, device=inputs.device) + buffer_list = list(torch.chunk(outputs, comm_size, dim=0)) + if not overlap: + dist.all_gather(buffer_list, inputs, group=group) + return outputs, None + else: + handle = dist.all_gather(buffer_list, inputs, group=group, async_op=True) + return outputs, handle + + @staticmethod + def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]: + return ( + ReduceScatter.forward(None, grad_outputs[0], ctx.comm_grp, False)[0], + None, + None, + ) + + +class ReduceScatter(torch.autograd.Function): + @staticmethod + def forward( + ctx: Any, + inputs: Tensor, + group: ProcessGroup, + overlap: bool = False, + ) -> Tuple[Tensor, Any]: + """ + Returns: + outputs: Tensor + handle: Optional[Work], if overlap is True + """ + assert ctx is not None or not overlap + + if ctx is not None: + ctx.comm_grp = group + + comm_size = dist.get_world_size(group) + if comm_size == 1: + return inputs.squeeze(0), None + + if not inputs.is_contiguous(): + inputs = inputs.contiguous() + + output_shape = inputs.shape[1:] + outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device) + buffer_list = list(torch.chunk(inputs, comm_size, dim=0)) + if not overlap: + dist.reduce_scatter(outputs, buffer_list, group=group) + return outputs, None + else: + handle = dist.reduce_scatter(outputs, buffer_list, group=group, async_op=True) + return outputs, handle + + @staticmethod + def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]: + # TODO: support async backward + return ( + AllGather.forward(None, grad_outputs[0], ctx.comm_grp, False)[0], + None, + None, + ) + + +# ====================================================== +# AlltoAll +# ====================================================== + + +@_torch_custom_op_wrapper("distributed::_all_to_all_func", mutates_args=(), device_types="cuda") +def _all_to_all_func(input_: torch.Tensor, world_size: int = 1, scatter_dim: int = 0, gather_dim: int = 0) -> torch.Tensor: + input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)] + output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)] + group = get_sp_comm_group() + dist.all_to_all(output_list, input_list, group=group) + return torch.cat(output_list, dim=gather_dim).contiguous() + + +@_torch_register_fake_wrapper("distributed::_all_to_all_func") +def _all_to_all_func_fake(input_: torch.Tensor, world_size: int = 1, scatter_dim: int = 0, gather_dim: int = 0) -> torch.Tensor: + inp_shape = list(input_.shape) + group = get_sp_comm_group() + world_size = dist.get_world_size(group) + if world_size == 1: + return input_ + + inp_shape[gather_dim] = inp_shape[gather_dim] * world_size + inp_shape[scatter_dim] = inp_shape[scatter_dim] // world_size + outputs = torch.empty(torch.Size(inp_shape), dtype=input_.dtype, device=input_.device, layout=input_.layout) + return outputs + + +class _AllToAll(torch.autograd.Function): + """All-to-all communication. 
+ + Args: + input_: input matrix + process_group: communication group + scatter_dim: scatter dimension + gather_dim: gather dimension + """ + + @staticmethod + def forward(ctx, input_, process_group, scatter_dim, gather_dim): + ctx.process_group = process_group + ctx.scatter_dim = scatter_dim + ctx.gather_dim = gather_dim + world_size = dist.get_world_size(process_group) + + return _wrapper_all_to_all_func(input_, world_size, scatter_dim, gather_dim) + + @staticmethod + def backward(ctx, *grad_output): + process_group = ctx.process_group + scatter_dim = ctx.gather_dim + gather_dim = ctx.scatter_dim + return_grad = _AllToAll.apply(*grad_output, process_group, scatter_dim, gather_dim) + return (return_grad, None, None, None) + + +def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1): + return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim) + + +# ====================================================== +# Sequence Gather & Split +# ====================================================== + + +def _split_sequence_func(inputs, pg: dist.ProcessGroup, dim=-1): + world_size = dist.get_world_size(pg) + if world_size == 1: + return inputs + + # Split along last dimension. + rank = dist.get_rank(pg) + dim_size = inputs.size(dim) + assert dim_size % world_size == 0, ( + f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), " + f"cannot split tensor evenly" + ) + + outputs = torch.split(inputs, dim_size // world_size, dim=dim)[rank] + return outputs + + +@_torch_custom_op_wrapper("distributed::_gather_sequence_func", mutates_args=(), device_types="cuda") +def _gather_sequence_func(inputs: torch.Tensor, dim: int = -1) -> torch.Tensor: + pg = get_sp_comm_group() + world_size = dist.get_world_size(pg) + if world_size == 1: + return inputs + + # all gather + inputs = inputs.contiguous() + outputs = [torch.empty_like(inputs) for _ in range(world_size)] + dist.all_gather(outputs, inputs, group=pg) + + # concat + outputs = torch.cat(outputs, dim=dim) + return outputs + + +@_torch_register_fake_wrapper("distributed::_gather_sequence_func") +def _gather_sequence_func_fake(inputs: torch.Tensor, dim: int = -1) -> torch.Tensor: + inp_shape = list(inputs.shape) + pg = get_sp_comm_group() + world_size = dist.get_world_size(pg) + if world_size == 1: + return inputs + + inp_shape[dim] = inp_shape[dim] * world_size + outputs = torch.empty(torch.Size(inp_shape), dtype=inputs.dtype, device=inputs.device, layout=inputs.layout) + return outputs + + +if torch.__version__ >= "2.4.0": + _wrapper_all_to_all_func = torch.ops.distributed._all_to_all_func + _wrapper_gather_sequence_func = torch.ops.distributed._gather_sequence_func +else: + _wrapper_all_to_all_func = _all_to_all_func + _wrapper_gather_sequence_func = _gather_sequence_func + + +class _GatherForwardSplitBackward(torch.autograd.Function): + """ + Gather the input sequence. + + Args: + input_: input matrix. + process_group: process group. 
+ dim: dimension + """ + + @staticmethod + def symbolic(graph, input_): + return _wrapper_gather_sequence_func(input_) + + @staticmethod + def forward(ctx, input_, process_group, dim, grad_scale): + ctx.process_group = process_group + ctx.dim = dim + ctx.grad_scale = grad_scale + return _wrapper_gather_sequence_func(input_, dim) + + @staticmethod + def backward(ctx, grad_output): + if ctx.grad_scale == "up": + grad_output = grad_output * dist.get_world_size(ctx.process_group) + elif ctx.grad_scale == "down": + grad_output = grad_output / dist.get_world_size(ctx.process_group) + + return _split_sequence_func(grad_output, ctx.process_group, ctx.dim), None, None, None + + +class _SplitForwardGatherBackward(torch.autograd.Function): + """ + Split sequence. + + Args: + input_: input matrix. + process_group: parallel mode. + dim: dimension + """ + + @staticmethod + def symbolic(graph, input_): + return _split_sequence_func(input_) + + @staticmethod + def forward(ctx, input_, process_group, dim, grad_scale): + ctx.process_group = process_group + ctx.dim = dim + ctx.grad_scale = grad_scale + return _split_sequence_func(input_, process_group, dim) + + @staticmethod + def backward(ctx, grad_output): + if ctx.grad_scale == "up": + grad_output = grad_output * dist.get_world_size(ctx.process_group) + elif ctx.grad_scale == "down": + grad_output = grad_output / dist.get_world_size(ctx.process_group) + return _wrapper_gather_sequence_func(grad_output, ctx.dim), None, None, None + + +def split_sequence(input_, process_group, dim, grad_scale=1.0): + return _SplitForwardGatherBackward.apply(input_, process_group, dim, grad_scale) + + +def gather_sequence(input_, process_group, dim, grad_scale=None): + return _GatherForwardSplitBackward.apply(input_, process_group, dim, grad_scale) diff --git a/Meissonic/InfinityStar/infinity/utils/comm/dist.py b/Meissonic/InfinityStar/infinity/utils/comm/dist.py new file mode 100644 index 0000000000000000000000000000000000000000..009bb238e78cd4b20332cf51c561f1bfbfaaf475 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/comm/dist.py @@ -0,0 +1,191 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import torch +import torch.distributed as dist + + +# ==================== +# All-To-All +# ==================== +def _all_to_all( + input_: torch.Tensor, + world_size: int, + group: dist.ProcessGroup, + scatter_dim: int, + gather_dim: int, +): + input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)] + output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)] + dist.all_to_all(output_list, input_list, group=group) + return torch.cat(output_list, dim=gather_dim).contiguous() + + +class _AllToAll(torch.autograd.Function): + """All-to-all communication. 
+ + Args: + input_: input matrix + process_group: communication group + scatter_dim: scatter dimension + gather_dim: gather dimension + """ + + @staticmethod + def forward(ctx, input_, process_group, scatter_dim, gather_dim): + ctx.process_group = process_group + ctx.scatter_dim = scatter_dim + ctx.gather_dim = gather_dim + ctx.world_size = dist.get_world_size(process_group) + output = _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim) + return output + + @staticmethod + def backward(ctx, grad_output): + grad_output = _all_to_all( + grad_output, + ctx.world_size, + ctx.process_group, + ctx.gather_dim, + ctx.scatter_dim, + ) + return ( + grad_output, + None, + None, + None, + ) + + +def all_to_all( + input_: torch.Tensor, + process_group: dist.ProcessGroup, + scatter_dim: int = 2, + gather_dim: int = 1, +): + return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim) + + +def _gather( + input_: torch.Tensor, + world_size: int, + group: dist.ProcessGroup, + gather_dim: int, +): + if gather_list is None: + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + dist.gather(input_, gather_list, group=group, gather_dim=gather_dim) + return gather_list + + +# ==================== +# Gather-Split +# ==================== + + +def _split(input_, pg: dist.ProcessGroup, dim=-1): + # skip if only one rank involved + world_size = dist.get_world_size(pg) + rank = dist.get_rank(pg) + if world_size == 1: + return input_ + + # Split along last dimension. + dim_size = input_.size(dim) + assert dim_size % world_size == 0, ( + f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), " + f"cannot split tensor evenly" + ) + + tensor_list = torch.split(input_, dim_size // world_size, dim=dim) + output = tensor_list[rank].contiguous() + + return output + + +def _gather(input_, pg: dist.ProcessGroup, dim=-1): + # skip if only one rank involved + input_ = input_.contiguous() + world_size = dist.get_world_size(pg) + dist.get_rank(pg) + + if world_size == 1: + return input_ + + # all gather + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + assert input_.device.type == "cuda" + torch.distributed.all_gather(tensor_list, input_, group=pg) + + # concat + output = torch.cat(tensor_list, dim=dim).contiguous() + + return output + + +class _GatherForwardSplitBackward(torch.autograd.Function): + """Gather the input from model parallel region and concatenate. + + Args: + input_: input matrix. + process_group: parallel mode. + dim: dimension + """ + + @staticmethod + def symbolic(graph, input_): + return _gather(input_) + + @staticmethod + def forward(ctx, input_, process_group, dim, grad_scale): + ctx.mode = process_group + ctx.dim = dim + ctx.grad_scale = grad_scale + return _gather(input_, process_group, dim) + + @staticmethod + def backward(ctx, grad_output): + if ctx.grad_scale == "up": + grad_output = grad_output * dist.get_world_size(ctx.mode) + elif ctx.grad_scale == "down": + grad_output = grad_output / dist.get_world_size(ctx.mode) + + return _split(grad_output, ctx.mode, ctx.dim), None, None, None + + +class _SplitForwardGatherBackward(torch.autograd.Function): + """ + Split the input and keep only the corresponding chuck to the rank. + + Args: + input_: input matrix. + process_group: parallel mode. 
+ dim: dimension + """ + + @staticmethod + def symbolic(graph, input_): + return _split(input_) + + @staticmethod + def forward(ctx, input_, process_group, dim, grad_scale): + ctx.mode = process_group + ctx.dim = dim + ctx.grad_scale = grad_scale + return _split(input_, process_group, dim) + + @staticmethod + def backward(ctx, grad_output): + if ctx.grad_scale == "up": + grad_output = grad_output * dist.get_world_size(ctx.mode) + elif ctx.grad_scale == "down": + grad_output = grad_output / dist.get_world_size(ctx.mode) + return _gather(grad_output, ctx.mode, ctx.dim), None, None, None + + +def split_forward_gather_backward(input_, process_group, dim, grad_scale=1.0): + return _SplitForwardGatherBackward.apply(input_, process_group, dim, grad_scale) + + +def gather_forward_split_backward(input_, process_group, dim, grad_scale=None): + return _GatherForwardSplitBackward.apply(input_, process_group, dim, grad_scale) \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/utils/comm/operation.py b/Meissonic/InfinityStar/infinity/utils/comm/operation.py new file mode 100644 index 0000000000000000000000000000000000000000..7660e846d2479d41f4122f99fc9c1395eb0fd64e --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/comm/operation.py @@ -0,0 +1,380 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from typing import Any, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn.functional as F +from einops import rearrange +from torch import Tensor +from torch.distributed import ProcessGroup + + +class AllToAll(torch.autograd.Function): + """Dispatches input tensor [e, c, h] to all experts by all_to_all_single + operation in torch.distributed. + """ + + @staticmethod + def forward( + ctx: Any, + inputs: Tensor, + group: ProcessGroup, + overlap: bool = False, + ) -> Tuple[Tensor, Any]: + """ + Returns: + outputs: Tensor + handle: Optional[Work], if overlap is True + """ + assert ctx is not None or not overlap + + if ctx is not None: + ctx.comm_grp = group + if not inputs.is_contiguous(): + inputs = inputs.contiguous() + if dist.get_world_size(group) == 1: + return inputs, None + output = torch.empty_like(inputs) + if not overlap: + dist.all_to_all_single(output, inputs, group=group) + return output, None + else: + handle = dist.all_to_all_single(output, inputs, group=group, async_op=True) + return output, handle + + @staticmethod + def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]: + return ( + AllToAll.forward(None, grad_outputs[0], ctx.comm_grp, False)[0], + None, + None, + ) + + +class AsyncAllGatherForTwo(torch.autograd.Function): + @staticmethod + def forward( + ctx: Any, + inputs: Tensor, + weight: Tensor, + bias: Tensor, + sp_rank: int, + sp_size: int, + group: Optional[ProcessGroup] = None, + ) -> Tuple[Tensor, Any]: + """ + Returns: + outputs: Tensor + handle: Optional[Work], if overlap is True + """ + from torch.distributed._functional_collectives import all_gather_tensor + + ctx.group = group + ctx.sp_rank = sp_rank + ctx.sp_size = sp_size + + # all gather inputs + all_inputs = all_gather_tensor(inputs.unsqueeze(0), 0, group) + # compute local qkv + local_qkv = F.linear(inputs, weight, bias).unsqueeze(0) + + # remote compute + remote_inputs = all_inputs[1 - sp_rank].view(list(local_qkv.shape[:-1]) + [-1]) + # compute remote qkv + remote_qkv = F.linear(remote_inputs, weight, bias) + + # concat local and remote qkv + if sp_rank == 0: + qkv = torch.cat([local_qkv, remote_qkv], dim=0) + else: + qkv = 
torch.cat([remote_qkv, local_qkv], dim=0) + qkv = rearrange(qkv, "sp b n c -> b (sp n) c") + + ctx.save_for_backward(inputs, weight, remote_inputs) + return qkv + + @staticmethod + def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]: + from torch.distributed._functional_collectives import reduce_scatter_tensor + + group = ctx.group + sp_rank = ctx.sp_rank + sp_size = ctx.sp_size + inputs, weight, remote_inputs = ctx.saved_tensors + + # split qkv_grad + qkv_grad = grad_outputs[0] + qkv_grad = rearrange(qkv_grad, "b (sp n) c -> sp b n c", sp=sp_size) + qkv_grad = torch.chunk(qkv_grad, 2, dim=0) + if sp_rank == 0: + local_qkv_grad, remote_qkv_grad = qkv_grad + else: + remote_qkv_grad, local_qkv_grad = qkv_grad + + # compute remote grad + remote_inputs_grad = torch.matmul(remote_qkv_grad, weight).squeeze(0) + weight_grad = torch.matmul(remote_qkv_grad.transpose(-1, -2), remote_inputs).squeeze(0).sum(0) + bias_grad = remote_qkv_grad.squeeze(0).sum(0).sum(0) + + # launch async reduce scatter + remote_inputs_grad_zero = torch.zeros_like(remote_inputs_grad) + if sp_rank == 0: + remote_inputs_grad = torch.cat([remote_inputs_grad_zero, remote_inputs_grad], dim=0) + else: + remote_inputs_grad = torch.cat([remote_inputs_grad, remote_inputs_grad_zero], dim=0) + remote_inputs_grad = reduce_scatter_tensor(remote_inputs_grad, "sum", 0, group) + + # compute local grad and wait for reduce scatter + local_input_grad = torch.matmul(local_qkv_grad, weight).squeeze(0) + weight_grad += torch.matmul(local_qkv_grad.transpose(-1, -2), inputs).squeeze(0).sum(0) + bias_grad += local_qkv_grad.squeeze(0).sum(0).sum(0) + + # sum remote and local grad + inputs_grad = remote_inputs_grad + local_input_grad + return inputs_grad, weight_grad, bias_grad, None, None, None + + +class AllGather(torch.autograd.Function): + @staticmethod + def forward( + ctx: Any, + inputs: Tensor, + group: Optional[ProcessGroup] = None, + overlap: bool = False, + ) -> Tuple[Tensor, Any]: + """ + Returns: + outputs: Tensor + handle: Optional[Work], if overlap is True + """ + assert ctx is not None or not overlap + + if ctx is not None: + ctx.comm_grp = group + + comm_size = dist.get_world_size(group) + # print(f"XW debug, All Gather Dist world size {comm_size}") + if comm_size == 1: + return inputs.unsqueeze(0), None + + buffer_shape = (comm_size,) + inputs.shape + outputs = torch.empty(buffer_shape, dtype=inputs.dtype, device=inputs.device) + buffer_list = list(torch.chunk(outputs, comm_size, dim=0)) + # buffer_list = list([ + # t.squeeze(0) for t in torch.chunk(outputs, comm_size, dim=0) + # ]) + + if not overlap: + # print("buffer list", len(buffer_list), [t.shape for t in buffer_list]) + # print("inputs", inputs.shape, inputs.is_contiguous()) + # print(group) + + dist.all_gather(buffer_list, inputs, group=group) + return outputs, None + else: + handle = dist.all_gather(buffer_list, inputs, group=group, async_op=True) + return outputs, handle + + @staticmethod + def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]: + return ( + ReduceScatter.forward(None, grad_outputs[0], ctx.comm_grp, False)[0], + None, + None, + ) + + +class ReduceScatter(torch.autograd.Function): + @staticmethod + def forward( + ctx: Any, + inputs: Tensor, + group: ProcessGroup, + overlap: bool = False, + ) -> Tuple[Tensor, Any]: + """ + Returns: + outputs: Tensor + handle: Optional[Work], if overlap is True + """ + assert ctx is not None or not overlap + + if ctx is not None: + ctx.comm_grp = group + + comm_size = dist.get_world_size(group) + if 
comm_size == 1: + return inputs.squeeze(0), None + + if not inputs.is_contiguous(): + inputs = inputs.contiguous() + + output_shape = inputs.shape[1:] + outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device) + buffer_list = list(torch.chunk(inputs, comm_size, dim=0)) + if not overlap: + dist.reduce_scatter(outputs, buffer_list, group=group) + return outputs, None + else: + handle = dist.reduce_scatter(outputs, buffer_list, group=group, async_op=True) + return outputs, handle + + @staticmethod + def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]: + # TODO: support async backward + return ( + AllGather.forward(None, grad_outputs[0], ctx.comm_grp, False)[0], + None, + None, + ) + + +# using all_to_all_single api to perform all to all communication +def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim): + inp_shape = list(input_.shape) + inp_shape[scatter_dim] = inp_shape[scatter_dim] // seq_world_size + if scatter_dim < 2: + input_t = input_.reshape([seq_world_size, inp_shape[scatter_dim]] + inp_shape[scatter_dim + 1 :]).contiguous() + else: + input_t = ( + input_.reshape([-1, seq_world_size, inp_shape[scatter_dim]] + inp_shape[scatter_dim + 1 :]) + .transpose(0, 1) + .contiguous() + ) + + output = torch.empty_like(input_t) + dist.all_to_all_single(output, input_t, group=group) + + if scatter_dim < 2: + output = output.transpose(0, 1).contiguous() + + return output.reshape( + inp_shape[:gather_dim] + + [ + inp_shape[gather_dim] * seq_world_size, + ] + + inp_shape[gather_dim + 1 :] + ).contiguous() + + +# using all_to_all api to perform all to all communication +def _all_to_all(input_, world_size, group, scatter_dim, gather_dim): + input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)] + output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)] + dist.all_to_all(output_list, input_list, group=group) + return torch.cat(output_list, dim=gather_dim).contiguous() + + +class _AllToAll(torch.autograd.Function): + """All-to-all communication. 
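For shape bookkeeping, the per-rank effect of `_all_to_all` is that the scatter dimension shrinks by the world size while the gather dimension grows by it. A single-process sketch of that arithmetic (sizes are made up, and a real run exchanges different chunks between ranks rather than reusing the local ones):

```python
import torch

world_size = 4
scatter_dim, gather_dim = 2, 1            # the defaults used by all_to_all_comm

# e.g. (batch, seq / P, heads, head_dim) -> (batch, seq, heads / P, head_dim)
x = torch.randn(2, 16, 8, 64)

chunks = [t.contiguous() for t in torch.tensor_split(x, world_size, dim=scatter_dim)]
# dist.all_to_all would swap these chunks across ranks; the swap does not change shapes,
# so concatenating the local chunks already yields each rank's output shape.
print(torch.cat(chunks, dim=gather_dim).shape)   # torch.Size([2, 64, 2, 64])
```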
+ + Args: + input_: input matrix + process_group: communication group + scatter_dim: scatter dimension + gather_dim: gather dimension + """ + + @staticmethod + def forward(ctx, input_, process_group, scatter_dim, gather_dim): + ctx.process_group = process_group + ctx.scatter_dim = scatter_dim + ctx.gather_dim = gather_dim + world_size = dist.get_world_size(process_group) + bsz, _, _ = input_.shape + + # Todo: Try to make all_to_all_single compatible with a large batch size + if bsz == 1: + return _all_to_all_single(input_, world_size, process_group, scatter_dim, gather_dim) + else: + return _all_to_all(input_, world_size, process_group, scatter_dim, gather_dim) + + @staticmethod + def backward(ctx, *grad_output): + process_group = ctx.process_group + scatter_dim = ctx.gather_dim + gather_dim = ctx.scatter_dim + return_grad = _AllToAll.apply(*grad_output, process_group, scatter_dim, gather_dim) + return (return_grad, None, None, None) + + +def model_sharding(model: torch.nn.Module): + global_rank = dist.get_rank() + world_size = dist.get_world_size() + for _, param in model.named_parameters(): + padding_size = (world_size - param.numel() % world_size) % world_size + if padding_size > 0: + padding_param = torch.nn.functional.pad(param.data.view(-1), [0, padding_size]) + else: + padding_param = param.data.view(-1) + splited_params = padding_param.split(padding_param.numel() // world_size) + splited_params = splited_params[global_rank] + param.data = splited_params + + +def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1): + return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim) + + +def _gather(input_, dim=-1, process_group=None): + # skip if only one rank involved + world_size = dist.get_world_size(process_group) + if world_size == 1: + return input_ + + # all gather + input_ = input_.contiguous() + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + torch.distributed.all_gather(tensor_list, input_, group=process_group) + + # concat + output = torch.cat(tensor_list, dim=dim).contiguous() + + return output + + +def _split(input_, dim=-1, process_group=None): + # skip if only one rank involved + world_size = dist.get_world_size(process_group) + if world_size == 1: + return input_ + + # Split along last dimension. + dim_size = input_.size(dim) + assert dim_size % world_size == 0, ( + f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), " + f"cannot split tensor evenly" + ) + + tensor_list = torch.split(input_, dim_size // world_size, dim=dim) + rank = dist.get_rank(process_group) + output = tensor_list[rank].clone().contiguous() + + return output + + +class _GatherForwardSplitBackward(torch.autograd.Function): + """Gather the input from model parallel region and concatenate. + + Args: + input_: input matrix. + parallel_mode: parallel mode. 
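The padding step in `model_sharding` above rounds each flattened parameter up to a multiple of the world size so every rank can hold an equal shard. The arithmetic in isolation (sizes and rank are hypothetical):

```python
import torch
import torch.nn.functional as F

world_size, rank = 8, 3
param = torch.randn(10, 7)                    # 70 elements, not divisible by 8

flat = param.view(-1)
padding = (world_size - flat.numel() % world_size) % world_size     # 2
padded = F.pad(flat, [0, padding]) if padding > 0 else flat         # 72 elements
shard = padded.split(padded.numel() // world_size)[rank]            # 9 elements on this rank
assert shard.numel() * world_size == padded.numel()
```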
+ dim: dimension + """ + + @staticmethod + def forward(ctx, input_, dim, process_group): + ctx.process_group = process_group + ctx.dim = dim + return _gather(input_, dim, process_group) + + @staticmethod + def backward(ctx, grad_output): + return _split(grad_output, ctx.dim, ctx.process_group), None, None + + +def gather_forward_split_backward(input_, dim, process_group): + return _GatherForwardSplitBackward.apply(input_, dim, process_group) + + diff --git a/Meissonic/InfinityStar/infinity/utils/comm/pg_utils.py b/Meissonic/InfinityStar/infinity/utils/comm/pg_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8daa7480fff4fbcd3c40f866c317e1ec9236e844 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/comm/pg_utils.py @@ -0,0 +1,233 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +# copy from colossalai and opendit +# +import itertools +from functools import reduce +from operator import mul +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import torch.distributed as dist +from torch.distributed import ProcessGroup + + +def prod(nums: List[int]) -> int: + """Product of a list of numbers. + + Args: + nums (List[int]): A list of numbers. + + Returns: + int: The product of the numbers. + """ + return reduce(mul, nums) + + +class ProcessGroupMesh: + """A helper class to manage the process group mesh. It only describes how to organize process groups, and it's decoupled with parallel method. + It just initialize process groups and cache them. The parallel method should manage them and use them to do the parallel computation. + + We use a ND-tuple to represent the process group mesh. And a ND-coordinate is to represent each process. + For example, ``(0, 1, 0)`` represents the process whose rank is 2 in a 3D process group mesh with size ``(2, 2, 2)``. + + Args: + *size (int): The size of each dimension of the process group mesh. The product of the size must be equal to the world size. + + Attributes: + shape (Tuple[int, ...]): The shape of the process group mesh. + rank (int): The rank of the current process. + """ + + def __init__(self, *size: int) -> None: + assert dist.is_initialized(), "Please initialize torch.distributed first." + assert prod(size) == dist.get_world_size(), f"The product of the size must be equal to the world size. However, got {prod(size)} and {dist.get_world_size()}." + self._shape = size + self._rank = dist.get_rank() + self._coord = ProcessGroupMesh.unravel(self._rank, self._shape) + self._ranks_to_group: Dict[Tuple[int, ...], ProcessGroup] = {} + self._group_to_ranks: Dict[ProcessGroup, Tuple[int, ...]] = {} + + @property + def shape(self) -> Tuple[int, ...]: + return self._shape + + @property + def rank(self) -> int: + return self._rank + + def size(self, dim: Optional[int] = None) -> Union[int, Tuple[int, ...]]: + """Get the size of the process group mesh. + + Args: + dim (Optional[int], optional): Dimension of the process group mesh. `None` means all dimensions. Defaults to None. + + Returns: + Union[int, Tuple[int, ...]]: Size of the target dimension or the whole process group mesh. + """ + if dim is None: + return self._shape + else: + return self._shape[dim] + + def coordinate(self, dim: Optional[int] = None) -> Union[int, Tuple[int, ...]]: + """Get the coordinate of the process group mesh. + + Args: + dim (Optional[int], optional): Dimension of the process group mesh. `None` means all dimensions. Defaults to None. 
+ + Returns: + Union[int, Tuple[int, ...]]: Coordinate of the target dimension or the whole process group mesh. + """ + if dim is None: + return self._coord + else: + return self._coord[dim] + + @staticmethod + def unravel(rank: int, shape: Tuple[int, ...]) -> Tuple[int, ...]: + """Convert a rank to a coordinate. + + Args: + rank (int): Rank to be converted. + shape (Tuple[int, ...]): Shape of the process group mesh. + + Returns: + Tuple[int, ...]: Coordinate of the rank. + """ + res = np.unravel_index(rank, shape) + return tuple(int(i) for i in res) + + @staticmethod + def ravel(coord: Tuple[int, ...], shape: Tuple[int, ...], mode: str = "raise") -> int: + """Convert a coordinate to a rank. + mode: ['raise', 'wrap', 'clip'], see https://numpy.org/doc/stable/reference/generated/numpy.ravel_multi_index.html. + with wrap, index out of range would be wrapped around. + For instance, ravel((0, i, 0), (1, 2, 1), 'wrap') returns (i % 2) + + Args: + coords (Tuple[int, ...]): Coordinate to be converted. + shape (Tuple[int, ...]): Shape of the process group mesh. + mode (Optional[str]): The mode for numpy.ravel_multi_index. + + Returns: + int: Rank of the coordinate. + """ + + assert mode in ["raise", "wrap", "clip"] + return int(np.ravel_multi_index(coord, shape, mode)) + + def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) -> ProcessGroup: + """Get the process group with the given ranks. It the process group doesn't exist, it will be created. + + Args: + ranks_in_group (List[int]): Ranks in the process group. + backend (Optional[str], optional): Backend of the process group. Defaults to None. + + Returns: + ProcessGroup: The process group with the given ranks. + """ + ranks_in_group = sorted(ranks_in_group) + if tuple(ranks_in_group) not in self._group_to_ranks: + group = dist.new_group(ranks_in_group, backend=backend) + self._ranks_to_group[tuple(ranks_in_group)] = group + self._group_to_ranks[group] = tuple(ranks_in_group) + return self._ranks_to_group[tuple(ranks_in_group)] + + def get_ranks_in_group(self, group: ProcessGroup) -> List[int]: + """Get the ranks in the given process group. The process group must be created by this class. + + Args: + group (ProcessGroup): The process group. + + Returns: + List[int]: Ranks in the process group. + """ + return list(self._group_to_ranks[group]) + + @staticmethod + def get_coords_along_axis( + base_coord: Tuple[int, ...], axis: int, indices_at_axis: List[int] + ) -> List[Tuple[int, ...]]: + """Get coordinates along the given axis. + + Args: + base_coord (Tuple[int, ...]): Base coordinate which the coordinates along the axis are based on. + axis (int): Axis along which the coordinates are generated. + indices_at_axis (List[int]): Indices at the axis. + + Returns: + List[Tuple[int, ...]]: Coordinates along the axis. + """ + coords_in_group = [] + for idx in indices_at_axis: + coords_in_group.append(base_coord[:axis] + (idx,) + base_coord[axis + 1 :]) + return coords_in_group + + def create_group_along_axis( + self, axis: int, indices_at_axis: Optional[List[int]] = None, backend: Optional[str] = None + ) -> ProcessGroup: + """Create all process groups along the given axis, and return the one which the current process belongs to. + + Args: + axis (int): Axis along which the process groups are created. + indices_at_axis (Optional[List[int]], optional): Indices at the axis. Defaults to None. + backend (Optional[str], optional): Backend of the process group. Defaults to None. 
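`ProcessGroupMesh` reduces to ordinary index arithmetic: `unravel` maps a rank to a mesh coordinate, `ravel` maps it back, and the ranks of a group along one axis are the coordinates that differ only at that axis. A numpy-only sketch (the 2x4 mesh and axis choice are hypothetical):

```python
import numpy as np

shape = (2, 4)                                       # e.g. (dp, sp) mesh for 8 ranks
rank = 6
coord = tuple(int(i) for i in np.unravel_index(rank, shape))        # (1, 2)

sp_axis = 1                                          # vary only the sp axis
coords = [coord[:sp_axis] + (i,) + coord[sp_axis + 1:] for i in range(shape[sp_axis])]
sp_ranks = [int(np.ravel_multi_index(c, shape)) for c in coords]
print(coord, sp_ranks)                               # (1, 2) [4, 5, 6, 7]
```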
+ + Returns: + ProcessGroup: The process group along the given axis which the current process belongs to. + """ + indices_at_axis = indices_at_axis or list(range(self._shape[axis])) + reduced_shape = list(self._shape) + # the choices on the axis are reduced to 1, since it's determined by `indices_at_axis` + reduced_shape[axis] = 1 + target_group = None + # use Cartesian product to generate all combinations of coordinates + for base_coord in itertools.product(*[range(s) for s in reduced_shape]): + coords_in_group = ProcessGroupMesh.get_coords_along_axis(base_coord, axis, indices_at_axis) + ranks_in_group = tuple([ProcessGroupMesh.ravel(coord, self._shape) for coord in coords_in_group]) + group = self.get_group(ranks_in_group, backend=backend) + if self._rank in ranks_in_group: + target_group = group + return target_group + + def get_group_along_axis( + self, axis: int, indices_at_axis: Optional[List[int]] = None, backend: Optional[str] = None + ) -> ProcessGroup: + """Get the process group along the given axis which the current process belongs to. If the process group doesn't exist, it will be created. + + Args: + axis (int): Axis along which the process groups are created. + indices_at_axis (Optional[List[int]], optional): Indices at the axis. Defaults to None. + backend (Optional[str], optional): Backend of the process group. Defaults to None. + + Returns: + ProcessGroup: The process group along the given axis which the current process belongs to. + """ + indices_at_axis = indices_at_axis or list(range(self._shape[axis])) + coords_in_group = ProcessGroupMesh.get_coords_along_axis(self._coord, axis, indices_at_axis) + ranks_in_group = tuple([ProcessGroupMesh.ravel(coord, self._shape) for coord in coords_in_group]) + if ranks_in_group not in self._ranks_to_group: + # no need to cache it explicitly, since it will be cached in `create_group_along_axis` + return self.create_group_along_axis(axis, indices_at_axis, backend=backend) + return self._ranks_to_group[ranks_in_group] + +from torch.distributed import ProcessGroup + + +class ProcessGroupManager(ProcessGroupMesh): + def __init__(self, *size: int, dp_axis, sp_axis): + super().__init__(*size) + self.dp_axis = dp_axis + self.sp_axis = sp_axis + self._dp_group: ProcessGroup = self.get_group_along_axis(self.dp_axis) + self._sp_group: ProcessGroup = self.get_group_along_axis(self.sp_axis) + + @property + def dp_group(self) -> ProcessGroup: + return self._dp_group + + @property + def sp_group(self) -> ProcessGroup: + return self._sp_group diff --git a/Meissonic/InfinityStar/infinity/utils/csv_util.py b/Meissonic/InfinityStar/infinity/utils/csv_util.py new file mode 100644 index 0000000000000000000000000000000000000000..a4b1d9b194fb2511f4d97cb55ddf5d8b3cf2e6ee --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/csv_util.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import os +import os.path as osp +import csv + +def write_dicts2csv_file(input_dict_list, csv_filename): + os.makedirs(osp.dirname(csv_filename), exist_ok=True) + with open(csv_filename, mode='w', newline='', encoding='utf-8') as file: + fieldnames = input_dict_list[0].keys() + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(input_dict_list) + print(f'"{csv_filename}" has been written.') + +def load_csv_as_dicts(csv_filename): + with open(csv_filename, mode='r', newline='', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + return list(reader) diff --git 
a/Meissonic/InfinityStar/infinity/utils/dist.py b/Meissonic/InfinityStar/infinity/utils/dist.py new file mode 100644 index 0000000000000000000000000000000000000000..973ba10a950dd770f997746ef0be4d923b273252 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/dist.py @@ -0,0 +1,326 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import datetime +import functools +import os +import sys +from typing import List +from typing import Union + +import pytz +import torch +import torch.distributed as tdist + +__rank, __local_rank, __world_size, __device = 0, 0, 1, 'cpu' +__rank_str_zfill = '0' +__initialized = False + + +def initialized(): + return __initialized + + +def __initialize(fork=False, backend='nccl', gpu_id_if_not_distibuted=0, timeout_minutes=30): + global __device + if not torch.cuda.is_available(): + print(f'[dist initialize] cuda is not available, use cpu instead', file=sys.stderr) + return + elif 'RANK' not in os.environ: + torch.cuda.set_device(gpu_id_if_not_distibuted) + __device = torch.empty(1).cuda().device + print(f'[dist initialize] env variable "RANK" is not set, use {__device} as the device', file=sys.stderr) + return + # then 'RANK' must exist + global_rank, num_gpus = int(os.environ['RANK']), torch.cuda.device_count() + local_rank = global_rank % num_gpus + torch.cuda.set_device(local_rank) + + # ref: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py#L29 + """ + if mp.get_start_method(allow_none=True) is None: + method = 'fork' if fork else 'spawn' + print(f'[dist initialize] mp method={method}') + mp.set_start_method(method) + """ + tdist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout_minutes * 60)) + + global __rank, __local_rank, __world_size, __initialized, __rank_str_zfill + __local_rank = local_rank + __rank, __world_size = tdist.get_rank(), tdist.get_world_size() + __rank_str_zfill = str(__rank).zfill(len(str(__world_size))) + __device = torch.device(local_rank) + __initialized = True + + assert tdist.is_initialized(), 'torch.distributed is not initialized!' 
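`__initialize` above derives the device from the launcher-provided global rank: each process pins itself to `RANK % num_gpus` on its node. The mapping on its own (the environment value and GPU count are placeholders):

```python
import os

os.environ.setdefault("RANK", "11")   # e.g. torchrun across 2 nodes x 8 GPUs sets RANK in [0, 16)

global_rank = int(os.environ["RANK"])
num_gpus = 8                          # torch.cuda.device_count() on each node
local_rank = global_rank % num_gpus
print(global_rank, local_rank)        # 11 3
```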
+ print(f'[lrk={get_local_rank()}, rk={get_rank()}]') + + +def get_rank(): + return __rank + + +def get_rank_given_group(group: tdist.ProcessGroup): + return tdist.get_rank(group=group) + + +def get_rank_str_zfill(): + return __rank_str_zfill + + +def get_local_rank(): + return __local_rank + + +def get_world_size(): + return __world_size + + +def get_device(): + return __device + + +def set_gpu_id(gpu_id: int): + if gpu_id is None: return + global __device + if isinstance(gpu_id, (str, int)): + torch.cuda.set_device(int(gpu_id)) + __device = torch.empty(1).cuda().device + else: + raise NotImplementedError + + +def is_master(): + return __rank == 0 + + +def is_local_master(): + return __local_rank == 0 + + +def is_visualizer(): + return __rank == 0 + # return __rank == max(__world_size - 8, 0) + + +def parallelize(net, syncbn=False): + if syncbn: + net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net) + net = net.cuda() + net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[get_local_rank()], find_unused_parameters=False, broadcast_buffers=False) + return net + + +def new_group(ranks: List[int]): + if __initialized: + return tdist.new_group(ranks=ranks) + return None + + +def new_local_machine_group(): + if __initialized: + cur_subgroup, subgroups = tdist.new_subgroups() + return cur_subgroup + return None + + +def barrier(): + if __initialized: + tdist.barrier() + + +def allreduce(t: torch.Tensor, async_op=False): + if __initialized: + if not t.is_cuda: + cu = t.detach().cuda() + ret = tdist.all_reduce(cu, async_op=async_op) + t.copy_(cu.cpu()) + else: + ret = tdist.all_reduce(t, async_op=async_op) + return ret + return None + + +def allgather(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]: + if __initialized: + if not t.is_cuda: + t = t.cuda() + ls = [torch.empty_like(t) for _ in range(__world_size)] + tdist.all_gather(ls, t) + else: + ls = [t] + if cat: + ls = torch.cat(ls, dim=0) + return ls + + +def allgather_diff_shape(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]: + if __initialized: + if not t.is_cuda: + t = t.cuda() + + t_size = torch.tensor(t.size(), device=t.device) + ls_size = [torch.empty_like(t_size) for _ in range(__world_size)] + tdist.all_gather(ls_size, t_size) + + max_B = max(size[0].item() for size in ls_size) + pad = max_B - t_size[0].item() + if pad: + pad_size = (pad, *t.size()[1:]) + t = torch.cat((t, t.new_empty(pad_size)), dim=0) + + ls_padded = [torch.empty_like(t) for _ in range(__world_size)] + tdist.all_gather(ls_padded, t) + ls = [] + for t, size in zip(ls_padded, ls_size): + ls.append(t[:size[0].item()]) + else: + ls = [t] + if cat: + ls = torch.cat(ls, dim=0) + return ls + + +def broadcast(t: torch.Tensor, src_rank) -> None: + if __initialized: + if not t.is_cuda: + cu = t.detach().cuda() + tdist.broadcast(cu, src=src_rank) + t.copy_(cu.cpu()) + else: + tdist.broadcast(t, src=src_rank) + + +def dist_fmt_vals(val: float, fmt: Union[str, None] = '%.2f') -> Union[torch.Tensor, List]: + if not initialized(): + return torch.tensor([val]) if fmt is None else [fmt % val] + + ts = torch.zeros(__world_size) + ts[__rank] = val + allreduce(ts) + if fmt is None: + return ts + return [fmt % v for v in ts.cpu().numpy().tolist()] + + +def master_only(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + force = kwargs.pop('force', False) + if force or is_master(): + ret = func(*args, **kwargs) + else: + ret = None + barrier() + return ret + return wrapper + + +def local_master_only(func): + 
@functools.wraps(func) + def wrapper(*args, **kwargs): + force = kwargs.pop('force', False) + if force or is_local_master(): + ret = func(*args, **kwargs) + else: + ret = None + barrier() + return ret + return wrapper + + +def for_visualize(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if is_visualizer(): + # with torch.no_grad(): + ret = func(*args, **kwargs) + else: + ret = None + return ret + return wrapper + + +def finalize(): + if __initialized: + tdist.destroy_process_group() + + +def init_distributed_mode(local_out_path, fork=False, only_sync_master=False, timeout_minutes=30): + try: + __initialize(fork=fork, timeout_minutes=timeout_minutes) + barrier() + except RuntimeError as e: + print(f'{"!"*80} dist init error (NCCL Error?), stopping training! {"!"*80}', flush=True) + raise e + + if local_out_path is not None: os.makedirs(local_out_path, exist_ok=True) + _change_builtin_print(is_local_master()) + if (is_master() if only_sync_master else is_local_master()) and local_out_path is not None and len(local_out_path): + sys.stdout, sys.stderr = BackupStreamToFile(local_out_path, for_stdout=True), BackupStreamToFile(local_out_path, for_stdout=False) + + +def _change_builtin_print(is_master): + import builtins as __builtin__ + + builtin_print = __builtin__.print + if type(builtin_print) != type(open): + return + + def prt(*args, **kwargs): + force = kwargs.pop('force', False) + clean = kwargs.pop('clean', False) + deeper = kwargs.pop('deeper', False) + if is_master or force: + if not clean: + f_back = sys._getframe().f_back + if deeper and f_back.f_back is not None: + f_back = f_back.f_back + file_desc = f'{f_back.f_code.co_filename:24s}'[-24:] + time_str = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('[%m-%d %H:%M:%S]') + builtin_print(f'{time_str} ({file_desc}, line{f_back.f_lineno:-4d})=>', *args, **kwargs) + else: + builtin_print(*args, **kwargs) + + __builtin__.print = prt + + +class BackupStreamToFile(object): + def __init__(self, local_output_dir, for_stdout=True): + self.for_stdout = for_stdout + self.terminal_stream = sys.stdout if for_stdout else sys.stderr + fname = os.path.join(local_output_dir, 'b1_stdout.txt' if for_stdout else 'b2_stderr.txt') + existing = os.path.exists(fname) + self.file_stream = open(fname, 'a') + if existing: + time_str = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('[%m-%d %H:%M:%S]') + self.file_stream.write('\n'*7 + '='*55 + f' RESTART {time_str} ' + '='*55 + '\n') + self.file_stream.flush() + os.system(f'ln -s {fname} /opt/tiger/run_trial/ >/dev/null 2>&1') + self.enabled = True + + def write(self, message): + self.terminal_stream.write(message) + self.file_stream.write(message) + + def flush(self): + self.terminal_stream.flush() + self.file_stream.flush() + + def isatty(self): + return True + + def close(self): + if not self.enabled: + return + self.enabled = False + self.file_stream.flush() + self.file_stream.close() + if self.for_stdout: + sys.stdout = self.terminal_stream + sys.stdout.flush() + else: + sys.stderr = self.terminal_stream + sys.stderr.flush() + + def __del__(self): + self.close() diff --git a/Meissonic/InfinityStar/infinity/utils/load.py b/Meissonic/InfinityStar/infinity/utils/load.py new file mode 100644 index 0000000000000000000000000000000000000000..379b1810096285a18aef388b1abf519a5428fdff --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/load.py @@ -0,0 +1,66 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +#!/usr/bin/python3 
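`BackupStreamToFile` above is a tee: every write goes both to the original stream and to a log file, so per-rank output survives on disk. The same idea in a few standalone lines (the file name here is made up):

```python
import sys

class Tee:
    def __init__(self, path):
        self.terminal = sys.stdout
        self.file = open(path, "a")

    def write(self, message):
        self.terminal.write(message)
        self.file.write(message)

    def flush(self):
        self.terminal.flush()
        self.file.flush()

sys.stdout = Tee("stdout_backup.txt")
print("goes to the terminal and to the file")
sys.stdout = sys.stdout.terminal      # restore the original stream
```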
+import torch + +from infinity.models import Infinity +from infinity.utils import arg_util + +def load_visual_tokenizer(args, device=None): + if not device: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if args.vae_type in [8,12,14,16,18,20,24,32,48,64,128]: + schedule_mode = "dynamic" + codebook_dim = args.vae_type # 18 + print(f'Load VAE from {args.vae_path}') + + if args.videovae == 10: # absorb patchify + from infinity.models.videovae.models.load_vae_bsq_wan_absorb_patchify import video_vae_model + vae_local = video_vae_model(args.vae_path, schedule_mode, codebook_dim, global_args=args, test_mode=True).to(device) + else: + raise ValueError(f"vae_type {args.vae_type} not supported") + else: + raise ValueError(f"vae_type {args.vae_type} not supported") + return vae_local + +def build_vae_gpt(args: arg_util.Args, force_flash=False, device='cuda'): + vae_local = load_visual_tokenizer(args, device) + + if force_flash: args.flash = True + gpt_kw = dict( + text_channels=args.Ct5, + text_maxlen=args.tlen, + norm_eps=args.norm_eps, + rms_norm=args.rms_norm, + cond_drop_rate=args.cfg, + rand_uncond=args.rand_uncond, + raw_scale_schedule=args.scale_schedule, + top_p=args.topp, + top_k=args.topk, + checkpointing=args.enable_checkpointing, + pad_to_multiplier=args.pad_to_multiplier, + use_flex_attn=args.use_flex_attn, + add_lvl_embeding_on_first_block=args.add_lvl_embeding_on_first_block, + num_of_label_value=args.num_of_label_value, + rope2d_each_sa_layer=args.rope2d_each_sa_layer, + rope2d_normalized_by_hw=args.rope2d_normalized_by_hw, + pn=args.pn, + train_h_div_w_list=None, + apply_spatial_patchify=args.apply_spatial_patchify, + video_frames=args.video_frames, + other_args=args, + ) + + print(f'[create gpt_wo_ddp] constructor kw={gpt_kw}\n') + gpt_kw['vae_local'] = vae_local + + model_str = args.model.replace('vgpt', 'infinity') # legacy + print(f"{model_str=}") + if model_str.rsplit('c', maxsplit=1)[-1].isdecimal(): + model_str, _ = model_str.rsplit('c', maxsplit=1) + from timm.models import create_model + gpt_wo_ddp: Infinity = create_model(model_str, **gpt_kw) + vae_local = vae_local.to('cuda') + assert all(not p.requires_grad for p in vae_local.parameters()) + assert all(p.requires_grad for n, p in gpt_wo_ddp.named_parameters()) + return vae_local, gpt_wo_ddp diff --git a/Meissonic/InfinityStar/infinity/utils/lr_control.py b/Meissonic/InfinityStar/infinity/utils/lr_control.py new file mode 100644 index 0000000000000000000000000000000000000000..9f46e945182210021f730cec62236411d86b222d --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/lr_control.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import math +from pprint import pformat +from typing import Tuple, List, Dict, Union + +import torch.nn +import infinity.utils.dist as dist + +def filter_params(model, ndim_dict, nowd_keys=(), lr_scale=0.0) -> Tuple[ + List[str], List[torch.nn.Parameter], List[Dict[str, Union[torch.nn.Parameter, float]]] +]: + with_lr_scale = hasattr(model, 'get_layer_id_and_scale_exp') and 0 < lr_scale <= 1 + print(f'[get_param_groups][lr decay] with_lr_scale={with_lr_scale}, lr_scale={lr_scale}') + para_groups, para_groups_dbg = {}, {} + names, paras = [], [] + names_no_grad = [] + count, numel = 0, 0 + for name, para in model.named_parameters(): + name = name.replace('_fsdp_wrapped_module.', '') + if not para.requires_grad: + names_no_grad.append(name) + continue # frozen weights + count += 1 + numel += para.numel() + names.append(name) + 
paras.append(para) + + if ndim_dict.get(name, 2) == 1 or name.endswith('bias') or any(k in name for k in nowd_keys): + cur_wd_sc, group_name = 0., 'ND' + # elif any(k in name for k in small_wd_keys): + # cur_wd_sc, group_name = small_wd, 'small_decay' + else: + cur_wd_sc, group_name = 1., 'D' + + if with_lr_scale: + layer_id, scale_exp = model.get_layer_id_and_scale_exp(name) + group_name = f'layer{layer_id}_' + group_name + cur_lr_sc = lr_scale ** scale_exp + dbg = f'[layer {layer_id}][sc = {lr_scale} ** {scale_exp}]' + else: + cur_lr_sc = 1. + dbg = f'[no scale]' + + if group_name not in para_groups: + para_groups[group_name] = {'params': [], 'wd_sc': cur_wd_sc, 'lr_sc': cur_lr_sc} + para_groups_dbg[group_name] = {'params': [], 'wd_sc': cur_wd_sc, 'lr_sc': dbg} + para_groups[group_name]['params'].append(para) + para_groups_dbg[group_name]['params'].append(name) + + for g in para_groups_dbg.values(): + g['params'] = pformat(', '.join(g['params']), width=200) + + print(f'[get_param_groups] param_groups = \n{pformat(para_groups_dbg, indent=2, width=240)}\n') + + for rk in range(dist.get_world_size()): + dist.barrier() + if dist.get_rank() == rk: + print(f'[get_param_groups][rank{dist.get_rank()}] {type(model).__name__=} {count=}, {numel=}', flush=True, force=True) + print('') + + assert len(names_no_grad) == 0, f'[get_param_groups] names_no_grad = \n{pformat(names_no_grad, indent=2, width=240)}\n' + del ndim_dict + return names, paras, list(para_groups.values()) \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity/utils/mfu/README.md b/Meissonic/InfinityStar/infinity/utils/mfu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..042d4691cc40a65b61e33b176a868deed2f6f523 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/mfu/README.md @@ -0,0 +1,14 @@ +# Usage experience + +```python + +mfutool.setup(5,-1) +mfutool.add(model) +mfutool.enable() + +mfutool.step() +mfutool = mfutool.get_mfu() +flops_detail_info = mfutool.get_flops_detail_info() + +mfutool.disable() +``` diff --git a/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/custom_flops_impl.py b/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/custom_flops_impl.py new file mode 100644 index 0000000000000000000000000000000000000000..cd4b32f8dd2d26562470104da5a7709f32ce3a45 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/custom_flops_impl.py @@ -0,0 +1,90 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch + +def custom_rmsnorm_forward_hook(module, args, kwargs, output): + if module.training and not torch.is_grad_enabled(): + return + + flops = 0 + hidden_states = args[0] + if len(hidden_states.shape) == 2: + # navit mode + bsz = 1 + seq_len = hidden_states.shape[0] + else: + bsz = hidden_states.shape[0] + seq_len = hidden_states.shape[1] + + flops = bsz * seq_len * (2 * getattr(module, "hidden_size") + 1) * 2 + module.__flops__ += int(flops) * (3 if module.training else 1) + +def custom_goku_attention_forward_hook(module, args, kwargs, output): + if module.training and not torch.is_grad_enabled(): + return + + flops = 0 + inputs_q = kwargs["inputs_q"] + inputs_kv = kwargs["inputs_kv"] if kwargs["inputs_kv"] is not None else inputs_q + + if len(inputs_q.shape) == 2: + # navit mode + q_bsz = kv_bsz = 1 + q_len = inputs_q.shape[0] + kv_len = inputs_kv.shape[0] + + cu_seqlens_q = kwargs["cu_seqlens_q"].to(torch.int64).cpu().numpy() + cu_seqlens_k = kwargs["cu_seqlens_k"].to(torch.int64).cpu().numpy() + + 
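For context on what the mfu tooling in this diff reports: model FLOPs utilization is conventionally the achieved FLOP throughput divided by the hardware peak over the same wall-clock window. A back-of-the-envelope sketch with placeholder numbers (not the profiler's exact accounting):

```python
total_flops = 3.2e15          # FLOPs counted over the profiled steps (placeholder)
elapsed_s = 2.5               # wall-clock time for those steps (placeholder)
num_gpus = 8
peak_flops_per_gpu = 989e12   # e.g. H100 dense BF16 peak; substitute the real device peak

mfu = total_flops / (elapsed_s * num_gpus * peak_flops_per_gpu)
print(f"MFU = {mfu:.2%}")     # ~16% with these made-up numbers
```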
attn_seq_coef = 0 + for i in range(len(cu_seqlens_q) - 1): + seqlen_q = cu_seqlens_q[i + 1] - cu_seqlens_q[i] + seqlen_k = cu_seqlens_k[i + 1] - cu_seqlens_k[i] + attn_seq_coef += seqlen_q * seqlen_k + else: + q_bsz = inputs_q.shape[0] + q_len = inputs_q.shape[1] + kv_bsz = inputs_kv.shape[0] + kv_len = inputs_kv.shape[1] + attn_seq_coef = q_len * kv_len + + sp_size = getattr(module, "sequence_parallel_size", 1) or 1 + num_heads = getattr(module, "num_heads") + head_dim = getattr(module, "head_dim") + + flops = q_bsz * num_heads * attn_seq_coef * head_dim * 2 * 2 // sp_size + + module.__flops__ += int(flops) * (3 if module.training else 1) + +def custom_flex_attention_forward_hook(module, args, kwargs, output): + if module.training and not torch.is_grad_enabled(): + return + + flops = 0 + + q = args[0] + k = args[1] + + q_bs, q_head, q_len ,q_dim = q.shape + kv_bs, kv_head, kv_len ,kv_dim = k.shape + + block_mask = getattr(module, "block_mask") + density = 1 + if block_mask: + # ref: https://gist.github.com/Chillee/2e270fc5413dbbce58c779f8c4eac66c + density = (100 - block_mask.sparsity())/100 + + flops = density * q_bs * q_head * q_dim * q_len * kv_len * 2 * 2 + + module.__flops__ += int(flops) * (3 if module.training else 1) + + +CUSTOM_HOOK_MAPPING = {} +CUSTOM_NAME_MAPPING = {} + +try: + from infinity.models.flex_attn import FlexAttn + CUSTOM_HOOK_MAPPING[FlexAttn] = custom_flex_attention_forward_hook +except: + print(f"[WARN] cannot import custom modules: FlexAttn") + diff --git a/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/func_flops_impl.py b/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/func_flops_impl.py new file mode 100644 index 0000000000000000000000000000000000000000..8288ec549f5f2154ac7242e1e8eaea4d950ef74d --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/func_flops_impl.py @@ -0,0 +1,253 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch +from typing import List, Optional + +Tensor = torch.Tensor + +def _prod(dims): + p = 1 + for v in dims: + p *= v + return p + + +def linear_flops_compute(input, weight, bias=None): + out_features = weight.shape[0] + macs = input.numel() * out_features + return 2 * macs, macs + + +def relu_flops_compute(input, inplace=False): + return input.numel(), 0 + + +def prelu_flops_compute(input: Tensor, weight: Tensor): + return input.numel(), 0 + + +def elu_flops_compute(input: Tensor, alpha: float = 1.0, inplace: bool = False): + return input.numel(), 0 + + +def leaky_relu_flops_compute(input: Tensor, negative_slope: float = 0.01, inplace: bool = False): + return input.numel(), 0 + + +def relu6_flops_compute(input: Tensor, inplace: bool = False): + return input.numel(), 0 + + +def silu_flops_compute(input: Tensor, inplace: bool = False): + return input.numel(), 0 + + +def gelu_flops_compute(input, **kwargs): + return input.numel(), 0 + + +def pool_flops_compute(input, + kernel_size, + stride=None, + padding=0, + dilation=None, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, + return_indices=None): + return input.numel(), 0 + + +def conv_flops_compute(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): + assert weight.shape[1] * groups == input.shape[1] + + batch_size = input.shape[0] + in_channels = input.shape[1] + out_channels = weight.shape[0] + kernel_dims = list(weight.shape[2:]) + input_dims = list(input.shape[2:]) + + length = len(input_dims) + + strides = stride if type(stride) is tuple else (stride, ) * length 
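The attention hooks above charge two matmuls (QK^T and attn x V) at 2 FLOPs per multiply-accumulate, i.e. roughly 4 x batch x heads x q_len x kv_len x head_dim. A quick numeric check of that bookkeeping with arbitrary shapes:

```python
batch, heads, q_len, kv_len, head_dim = 2, 16, 1024, 1024, 64

macs = batch * heads * q_len * kv_len * head_dim * 2   # QK^T plus attn @ V
flops = 2 * macs                                        # 2 FLOPs per multiply-accumulate
print(flops)                                            # 8_589_934_592
```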
+ dilations = dilation if type(dilation) is tuple else (dilation, ) * length + if isinstance(padding, str): + if padding == 'valid': + paddings = (0, ) * length + elif padding == 'same': + paddings = () + for d, k in zip(dilations, kernel_dims): + total_padding = d * (k - 1) + paddings += (total_padding // 2, ) + elif isinstance(padding, tuple): + paddings = padding + else: + paddings = (padding, ) * length + + output_dims = [] + for idx, input_dim in enumerate(input_dims): + output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] * + (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 + output_dims.append(output_dim) + + filters_per_channel = out_channels // groups + conv_per_position_macs = int(_prod(kernel_dims)) * in_channels * filters_per_channel + active_elements_count = batch_size * int(_prod(output_dims)) + overall_conv_macs = conv_per_position_macs * active_elements_count + overall_conv_flops = 2 * overall_conv_macs + + bias_flops = 0 + if bias is not None: + bias_flops = out_channels * active_elements_count + + return int(overall_conv_flops + bias_flops), int(overall_conv_macs) + + +def conv_trans_flops_compute( + input, + weight, + bias=None, + stride=1, + padding=0, + output_padding=0, + groups=1, + dilation=1, +): + batch_size = input.shape[0] + in_channels = input.shape[1] + out_channels = weight.shape[1] + kernel_dims = list(weight.shape[2:]) + input_dims = list(input.shape[2:]) + + length = len(input_dims) + + paddings = padding if type(padding) is tuple else (padding, ) * length + strides = stride if type(stride) is tuple else (stride, ) * length + dilations = dilation if type(dilation) is tuple else (dilation, ) * length + + output_dims = [] + for idx, input_dim in enumerate(input_dims): + + output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] * + (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 + output_dims.append(output_dim) + + paddings = padding if type(padding) is tuple else (padding, padding) + strides = stride if type(stride) is tuple else (stride, stride) + dilations = dilation if type(dilation) is tuple else (dilation, dilation) + + filters_per_channel = out_channels // groups + conv_per_position_macs = int(_prod(kernel_dims)) * in_channels * filters_per_channel + active_elements_count = batch_size * int(_prod(input_dims)) + overall_conv_macs = conv_per_position_macs * active_elements_count + overall_conv_flops = 2 * overall_conv_macs + + bias_flops = 0 + if bias is not None: + bias_flops = out_channels * batch_size * int(_prod(output_dims)) + + return int(overall_conv_flops + bias_flops), int(overall_conv_macs) + + +def batch_norm_flops_compute( + input, + running_mean, + running_var, + weight=None, + bias=None, + training=False, + momentum=0.1, + eps=1e-05, +): + has_affine = weight is not None + if training: + # estimation + return input.numel() * (5 if has_affine else 4), 0 + flops = input.numel() * (2 if has_affine else 1) + return flops, 0 + + +def layer_norm_flops_compute( + input: Tensor, + normalized_shape: List[int], + weight: Optional[Tensor] = None, + bias: Optional[Tensor] = None, + eps: float = 1e-5, +): + has_affine = weight is not None + # estimation + return input.numel() * (5 if has_affine else 4), 0 + + +def group_norm_flops_compute(input: Tensor, + num_groups: int, + weight: Optional[Tensor] = None, + bias: Optional[Tensor] = None, + eps: float = 1e-5): + has_affine = weight is not None + # estimation + return input.numel() * (5 if has_affine else 4), 0 + + +def instance_norm_flops_compute( + input: Tensor, + running_mean: 
Optional[Tensor] = None,
+    running_var: Optional[Tensor] = None,
+    weight: Optional[Tensor] = None,
+    bias: Optional[Tensor] = None,
+    use_input_stats: bool = True,
+    momentum: float = 0.1,
+    eps: float = 1e-5,
+):
+    has_affine = weight is not None
+    # estimation
+    return input.numel() * (5 if has_affine else 4), 0
+
+
+def upsample_flops_compute(*args, **kwargs):
+    input = args[0]
+    size = kwargs.get('size', None)
+    if size is None and len(args) > 1:
+        size = args[1]
+
+    if size is not None:
+        if isinstance(size, tuple) or isinstance(size, list):
+            return int(_prod(size)), 0
+        else:
+            return int(size), 0
+
+    scale_factor = kwargs.get('scale_factor', None)
+    if scale_factor is None and len(args) > 2:
+        scale_factor = args[2]
+    assert scale_factor is not None, "either size or scale_factor should be defined"
+
+    flops = input.numel()
+    if isinstance(scale_factor, tuple) and len(scale_factor) == len(input):
+        flops *= int(_prod(scale_factor))
+    else:
+        flops *= scale_factor**len(input)
+    return flops, 0
+
+
+def softmax_flops_compute(input, dim=None, _stacklevel=3, dtype=None):
+    return input.numel(), 0
+
+
+def embedding_flops_compute(
+    input,
+    weight,
+    padding_idx=None,
+    max_norm=None,
+    norm_type=2.0,
+    scale_grad_by_freq=False,
+    sparse=False,
+):
+    return 0, 0
+
+def attn_flops_compute(query, key, value, *args, **kwargs):
+    """
+    Count flops for the scaled_dot_product_attention operation.
+    """
+    macs = _prod(query.shape) * key.shape[-2]
+    macs += _prod(query.shape[:-1]) * key.shape[-2] * value.shape[-1]
+    return 2 * macs, macs
diff --git a/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/nn_flops_impl.py b/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/nn_flops_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..266383a397c5cd2d7090bb37b2ccbbf01497eac0
--- /dev/null
+++ b/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/nn_flops_impl.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2025 FoundationVision
+# SPDX-License-Identifier: MIT
+import torch.nn as nn
+
+def rnn_flops(flops, rnn_module, w_ih, w_hh, input_size):
+    gates_size = w_ih.shape[0]
+    # matrix matrix mult ih state and internal state
+    flops += 2 * w_ih.shape[0] * w_ih.shape[1] - gates_size
+    # matrix matrix mult hh state and internal state
+    flops += 2 * w_hh.shape[0] * w_hh.shape[1] - gates_size
+    if isinstance(rnn_module, (nn.RNN, nn.RNNCell)):
+        # add both operations
+        flops += rnn_module.hidden_size
+    elif isinstance(rnn_module, (nn.GRU, nn.GRUCell)):
+        # hadamard of r
+        flops += rnn_module.hidden_size
+        # adding operations from both states
+        flops += rnn_module.hidden_size * 3
+        # last two hadamard _product and add
+        flops += rnn_module.hidden_size * 3
+    elif isinstance(rnn_module, (nn.LSTM, nn.LSTMCell)):
+        # adding operations from both states
+        flops += rnn_module.hidden_size * 4
+        # two hadamard _product and add for C state
+        flops += rnn_module.hidden_size + rnn_module.hidden_size + rnn_module.hidden_size
+        # final hadamard
+        flops += rnn_module.hidden_size + rnn_module.hidden_size + rnn_module.hidden_size
+    return flops
+
+
+def rnn_forward_hook(rnn_module, input, output):
+    flops = 0
+    # input is a tuple containing a sequence to process and (optionally) hidden state
+    inp = input[0]
+    batch_size = inp.shape[0]
+    seq_length = inp.shape[1]
+    num_layers = rnn_module.num_layers
+
+    for i in range(num_layers):
+        w_ih = rnn_module.__getattr__("weight_ih_l" + str(i))
+        w_hh = rnn_module.__getattr__("weight_hh_l" + str(i))
+        if i == 0:
+            input_size = rnn_module.input_size
+        else:
+
input_size = rnn_module.hidden_size + flops = rnn_flops(flops, rnn_module, w_ih, w_hh, input_size) + if rnn_module.bias: + b_ih = rnn_module.__getattr__("bias_ih_l" + str(i)) + b_hh = rnn_module.__getattr__("bias_hh_l" + str(i)) + flops += b_ih.shape[0] + b_hh.shape[0] + + flops *= batch_size + flops *= seq_length + if rnn_module.bidirectional: + flops *= 2 + rnn_module.__flops__ += int(flops) + + +def rnn_cell_forward_hook(rnn_cell_module, input, output): + flops = 0 + inp = input[0] + batch_size = inp.shape[0] + w_ih = rnn_cell_module.__getattr__("weight_ih") + w_hh = rnn_cell_module.__getattr__("weight_hh") + input_size = inp.shape[1] + flops = rnn_flops(flops, rnn_cell_module, w_ih, w_hh, input_size) + if rnn_cell_module.bias: + b_ih = rnn_cell_module.__getattr__("bias_ih") + b_hh = rnn_cell_module.__getattr__("bias_hh") + flops += b_ih.shape[0] + b_hh.shape[0] + + flops *= batch_size + rnn_cell_module.__flops__ += int(flops) + + +MODULE_HOOK_MAPPING = { + # RNN + nn.RNN: rnn_forward_hook, + nn.GRU: rnn_forward_hook, + nn.LSTM: rnn_forward_hook, + nn.RNNCell: rnn_cell_forward_hook, + nn.LSTMCell: rnn_cell_forward_hook, + nn.GRUCell: rnn_cell_forward_hook, +} + diff --git a/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/tensor_flops_impl.py b/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/tensor_flops_impl.py new file mode 100644 index 0000000000000000000000000000000000000000..ae30a3979accb1216766743c755ce6bcecd3d84f --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/mfu/flops_calc_impl/tensor_flops_impl.py @@ -0,0 +1,114 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch +from collections import OrderedDict +import numpy as np + +Tensor = torch.Tensor + +def _prod(dims): + p = 1 + for v in dims: + p *= v + return p + +def matmul_flops_compute(input, other, *, out=None): + """ + Count flops for the matmul operation. + """ + macs = _prod(input.shape) * other.shape[-1] + return 2 * macs, macs + + +def addmm_flops_compute(input, mat1, mat2, *, beta=1, alpha=1, out=None): + """ + Count flops for the addmm operation. + """ + macs = _prod(mat1.shape) * mat2.shape[-1] + return 2 * macs + _prod(input.shape), macs + + +def einsum_flops_compute(equation, *operands): + """ + Count flops for the einsum operation. + """ + equation = equation.replace(" ", "") + input_shapes = [o.shape for o in operands] + + # Re-map equation so that same equation with different alphabet + # representations will look the same. + letter_order = OrderedDict((k, 0) for k in equation if k.isalpha()).keys() + mapping = {ord(x): 97 + i for i, x in enumerate(letter_order)} + equation = equation.translate(mapping) + + np_arrs = [np.zeros(s) for s in input_shapes] + optim = np.einsum_path(equation, *np_arrs, optimize="optimal")[1] + for line in optim.split("\n"): + if "optimized flop" in line.lower(): + flop = int(float(line.split(":")[-1])) + return flop, 0 + raise NotImplementedError("Unsupported einsum operation.") + + +def einops_einsum_flops_compute(*args): + """ + Count flops for the einops.einsum operation. + """ + *operands, equation = args + input_shapes = [o.shape for o in operands] + + # Re-map equation so that same equation with different alphabet + # representations will look the same. 
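The einsum counters here rely on `numpy.einsum_path`, whose report string contains an "Optimized FLOP count" line that the code parses. The mechanism in isolation, for a plain matrix product:

```python
import numpy as np

a, b = np.zeros((128, 256)), np.zeros((256, 64))
_, report = np.einsum_path("ij,jk->ik", a, b, optimize="optimal")

for line in report.split("\n"):
    if "optimized flop" in line.lower():
        print(line.strip())      # on the order of 2 * 128 * 256 * 64 FLOPs
```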
+ letter_order = OrderedDict((k, 0) for k in equation if k.isalpha()).keys() + mapping = {ord(x): 97 + i for i, x in enumerate(letter_order)} + equation = equation.translate(mapping) + + np_arrs = [np.zeros(s) for s in input_shapes] + optim = np.einsum_path(equation, *np_arrs, optimize="optimal")[1] + for line in optim.split("\n"): + if "optimized flop" in line.lower(): + flop = int(float(line.split(":")[-1])) + return flop, 0 + + raise NotImplementedError("Unsupported einops.einsum operation.") + + +def tensor_addmm_flops_compute(self, mat1, mat2, *, beta=1, alpha=1, out=None): + """ + Count flops for the tensor addmm operation. + """ + macs = _prod(mat1.shape) * mat2.shape[-1] + return 2 * macs + _prod(self.shape), macs + + +def mul_flops_compute(input, other, *, out=None): + return elementwise_flops_compute(input, other) + + +def add_flops_compute(input, other, *, alpha=1, out=None): + return elementwise_flops_compute(input, other) + + +def elementwise_flops_compute(input, other): + if not torch.is_tensor(input): + if torch.is_tensor(other): + return _prod(other.shape), 0 + else: + return 1, 0 + elif not torch.is_tensor(other): + return _prod(input.shape), 0 + else: + dim_input = len(input.shape) + dim_other = len(other.shape) + max_dim = max(dim_input, dim_other) + + final_shape = [] + for i in range(max_dim): + in_i = input.shape[i] if i < dim_input else 1 + ot_i = other.shape[i] if i < dim_other else 1 + if in_i > ot_i: + final_shape.append(in_i) + else: + final_shape.append(ot_i) + flops = _prod(final_shape) + return flops, 0 diff --git a/Meissonic/InfinityStar/infinity/utils/mfu/flops_profiler.py b/Meissonic/InfinityStar/infinity/utils/mfu/flops_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..8644d398c8c95b2f31e541fcb44071acf02bb366 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/mfu/flops_profiler.py @@ -0,0 +1,374 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +# source: https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/profiling/flops_profiler + +# DeepSpeed Team +import os +import time +import torch +import torch.nn.functional as F +import logging +from functools import partial +import einops + +from .flops_calc_impl.func_flops_impl import * +from .flops_calc_impl.nn_flops_impl import * +from .flops_calc_impl.tensor_flops_impl import * +from .flops_calc_impl.custom_flops_impl import * + +logger = logging.getLogger(__name__) + +old_functions = {} + +DEFAULT_PRECISION = 2 + +class FlopsProfiler(object): + """Measures the latency, number of estimated floating-point operations and parameters of each module in a PyTorch model. + + The flops-profiler profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. It shows how latency, flops and parameters are spent in the model and which modules or layers could be the bottleneck. It also outputs the names of the top k modules in terms of aggregated latency, flops, and parameters at depth l with k and l specified by the user. The output profile is computed for each batch of input. + The DeepSpeed flops profiler can be used with the DeepSpeed runtime or as a standalone package. + When using DeepSpeed for model training, the flops profiler can be configured in the deepspeed_config file and no user code change is required. + + If using the profiler as a standalone package, one imports the flops_profiler package and use the APIs. 
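The profiler defined here counts FLOPs by temporarily swapping `torch.nn.functional` entry points for wrappers that log a count and then call the original. A stripped-down sketch of that pattern, independent of the class itself:

```python
import torch
import torch.nn.functional as F

flop_log = []
_orig_linear = F.linear

def counting_linear(input, weight, bias=None):
    flop_log.append(2 * input.numel() * weight.shape[0])   # 2 FLOPs per MAC
    return _orig_linear(input, weight, bias)

F.linear = counting_linear
_ = torch.nn.Linear(64, 32)(torch.randn(4, 64))
F.linear = _orig_linear            # always restore the original

print(sum(flop_log))               # 2 * 4 * 64 * 32 = 16384
```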
+ + Here is an example for usage in a typical training workflow: + + .. code-block:: python + + model = Model() + prof = FlopsProfiler(model) + + for step, batch in enumerate(data_loader): + if step == profile_step: + prof.start_profile() + + loss = model(batch) + + if step == profile_step: + flops = prof.get_total_flops() + prof.end_profile() + + loss.backward() + optimizer.step() + + To profile a trained model in inference, use the `get_model_profile` API. + + Args: + object (torch.nn.Module): The PyTorch model to profile. + """ + + def __init__(self): + self.models = [] + self.started = False + self.func_patched = False + self.module_flop_count = [] + self.detail_flops = "" + + def append(self, model): + self.models.append(model) + + def start_profile(self, ignore_list=None): + """Starts profiling. + + Extra attributes are added recursively to all the modules and the profiled torch.nn.functionals are monkey patched. + + Args: + ignore_list (list, optional): the list of modules to ignore while profiling. Defaults to None. + """ + self.ignore_list = ignore_list + self.reset_profile() + _patch_functionals(self.module_flop_count) + _patch_tensor_methods(self.module_flop_count) + _patch_miscellaneous_operations(self.module_flop_count) + + def register_module_hooks(module, ignore_list): + if ignore_list and type(module) in ignore_list: + return + + # if computing the flops of a module directly + if type(module) in MODULE_HOOK_MAPPING: + if not hasattr(module, "__flops_handle__"): + module.__flops_handle__ = module.register_forward_hook(MODULE_HOOK_MAPPING[type(module)]) + return + + if type(module) in CUSTOM_HOOK_MAPPING: + if not hasattr(module, "__flops_handle__"): + module.__flops_handle__ = module.register_forward_hook(CUSTOM_HOOK_MAPPING[type(module)], with_kwargs=True) + return + + # if computing the flops of the functionals in a module + def pre_hook(module, input): + self.module_flop_count.append([]) + + if not hasattr(module, "__pre_hook_handle__"): + module.__pre_hook_handle__ = module.register_forward_pre_hook(pre_hook) + + def post_hook(module, input, output): + if self.module_flop_count: + + if torch.is_grad_enabled(): + module.__flops__ += sum([elem[1] for elem in self.module_flop_count[-1]]) * (3 if module.training else 1) + + self.module_flop_count.pop() + + if not hasattr(module, "__post_hook_handle__"): + module.__post_hook_handle__ = module.register_forward_hook(post_hook) + + + for model in self.models: + model.apply(partial(register_module_hooks, ignore_list=ignore_list)) + + self.started = True + self.func_patched = True + logger.info("Flops profiler started") + + def stop_profile(self): + """Stop profiling. + + All torch.nn.functionals are restored to their originals. + """ + self.module_flop_count.clear() + if self.started and self.func_patched: + _reload_functionals() + _reload_tensor_methods() + _reload_miscellaneous_operations() + self.func_patched = False + + def remove_profile_attrs(module): + if hasattr(module, "__pre_hook_handle__"): + module.__pre_hook_handle__.remove() + del module.__pre_hook_handle__ + if hasattr(module, "__post_hook_handle__"): + module.__post_hook_handle__.remove() + del module.__post_hook_handle__ + if hasattr(module, "__flops_handle__"): + module.__flops_handle__.remove() + del module.__flops_handle__ + + for model in self.models: + model.apply(remove_profile_attrs) + + def reset_profile(self): + """Resets the profiling. + + Adds or resets the extra attributes. 
+ """ + self.module_flop_count.clear() + def add_or_reset_attrs(module): + module.__flops__ = 0 + + for model in self.models: + model.apply(add_or_reset_attrs) + + def end_profile(self): + """Ends profiling. + + The added attributes and handles are removed recursively on all the modules. + """ + if not self.started: + return + self.stop_profile() + self.started = False + self.module_flop_count.clear() + + def remove_profile_attrs(module): + if hasattr(module, "__flops__"): + del module.__flops__ + + for model in self.models: + model.apply(remove_profile_attrs) + logger.info("Flops profiler finished") + + def get_total_flops(self): + """Returns the total flops of the model. + + Returns: + The number of multiply-accumulate operations of the model forward pass. + """ + total_flops = 0 + self.detail_flops = "" + for model in self.models: + flops, log = get_module_flops(model, prefix="") + total_flops += flops + self.detail_flops += log + return total_flops, self.detail_flops + +def wrapFunc(func, funcFlopCompute, module_flop_count): + oldFunc = func + name = func.__str__ + old_functions[name] = oldFunc + + @torch.compiler.disable() + def newFunc(*args, **kwds): + flops, macs = funcFlopCompute(*args, **kwds) + if module_flop_count: + module_flop_count[-1].append((name, flops, func.__name__)) + return oldFunc(*args, **kwds) + + newFunc.__str__ = func.__str__ + + return newFunc + + +def _patch_functionals(module_flop_count): + # FC + F.linear = wrapFunc(F.linear, linear_flops_compute, module_flop_count) + + # convolutions + F.conv1d = wrapFunc(F.conv1d, conv_flops_compute, module_flop_count) + F.conv2d = wrapFunc(F.conv2d, conv_flops_compute, module_flop_count) + F.conv3d = wrapFunc(F.conv3d, conv_flops_compute, module_flop_count) + + # conv transposed + F.conv_transpose1d = wrapFunc(F.conv_transpose1d, conv_trans_flops_compute, module_flop_count) + F.conv_transpose2d = wrapFunc(F.conv_transpose2d, conv_trans_flops_compute, module_flop_count) + F.conv_transpose3d = wrapFunc(F.conv_transpose3d, conv_trans_flops_compute, module_flop_count) + + # activations + F.relu = wrapFunc(F.relu, relu_flops_compute, module_flop_count) + F.prelu = wrapFunc(F.prelu, prelu_flops_compute, module_flop_count) + F.elu = wrapFunc(F.elu, elu_flops_compute, module_flop_count) + F.leaky_relu = wrapFunc(F.leaky_relu, leaky_relu_flops_compute, module_flop_count) + F.relu6 = wrapFunc(F.relu6, relu6_flops_compute, module_flop_count) + if hasattr(F, "silu"): + F.silu = wrapFunc(F.silu, silu_flops_compute, module_flop_count) + F.gelu = wrapFunc(F.gelu, gelu_flops_compute, module_flop_count) + + # Normalizations + F.batch_norm = wrapFunc(F.batch_norm, batch_norm_flops_compute, module_flop_count) + F.layer_norm = wrapFunc(F.layer_norm, layer_norm_flops_compute, module_flop_count) + F.instance_norm = wrapFunc(F.instance_norm, instance_norm_flops_compute, module_flop_count) + F.group_norm = wrapFunc(F.group_norm, group_norm_flops_compute, module_flop_count) + + # poolings + F.avg_pool1d = wrapFunc(F.avg_pool1d, pool_flops_compute, module_flop_count) + F.avg_pool2d = wrapFunc(F.avg_pool2d, pool_flops_compute, module_flop_count) + F.avg_pool3d = wrapFunc(F.avg_pool3d, pool_flops_compute, module_flop_count) + F.max_pool1d = wrapFunc(F.max_pool1d, pool_flops_compute, module_flop_count) + F.max_pool2d = wrapFunc(F.max_pool2d, pool_flops_compute, module_flop_count) + F.max_pool3d = wrapFunc(F.max_pool3d, pool_flops_compute, module_flop_count) + F.adaptive_avg_pool1d = wrapFunc(F.adaptive_avg_pool1d, pool_flops_compute, 
module_flop_count) + F.adaptive_avg_pool2d = wrapFunc(F.adaptive_avg_pool2d, pool_flops_compute, module_flop_count) + F.adaptive_avg_pool3d = wrapFunc(F.adaptive_avg_pool3d, pool_flops_compute, module_flop_count) + F.adaptive_max_pool1d = wrapFunc(F.adaptive_max_pool1d, pool_flops_compute, module_flop_count) + F.adaptive_max_pool2d = wrapFunc(F.adaptive_max_pool2d, pool_flops_compute, module_flop_count) + F.adaptive_max_pool3d = wrapFunc(F.adaptive_max_pool3d, pool_flops_compute, module_flop_count) + + # upsample + F.upsample = wrapFunc(F.upsample, upsample_flops_compute, module_flop_count) + F.interpolate = wrapFunc(F.interpolate, upsample_flops_compute, module_flop_count) + + # softmax + F.softmax = wrapFunc(F.softmax, softmax_flops_compute, module_flop_count) + + # embedding + F.embedding = wrapFunc(F.embedding, embedding_flops_compute, module_flop_count) + + # attn - scaled_dot_product_attention added in torch 2.0+ + F.scaled_dot_product_attention = wrapFunc(F.scaled_dot_product_attention, attn_flops_compute, module_flop_count) + +def _patch_tensor_methods(module_flop_count): + torch.matmul = wrapFunc(torch.matmul, matmul_flops_compute, module_flop_count) + torch.Tensor.matmul = wrapFunc(torch.Tensor.matmul, matmul_flops_compute, module_flop_count) + torch.Tensor.__matmul__ = wrapFunc(torch.Tensor.__matmul__, matmul_flops_compute, module_flop_count) + torch.mm = wrapFunc(torch.mm, matmul_flops_compute, module_flop_count) + torch.Tensor.mm = wrapFunc(torch.Tensor.mm, matmul_flops_compute, module_flop_count) + torch.bmm = wrapFunc(torch.bmm, matmul_flops_compute, module_flop_count) + torch.Tensor.bmm = wrapFunc(torch.Tensor.bmm, matmul_flops_compute, module_flop_count) + + torch.addmm = wrapFunc(torch.addmm, addmm_flops_compute, module_flop_count) + torch.Tensor.addmm = wrapFunc(torch.Tensor.addmm, tensor_addmm_flops_compute, module_flop_count) + + torch.mul = wrapFunc(torch.mul, mul_flops_compute, module_flop_count) + torch.Tensor.mul = wrapFunc(torch.Tensor.mul, mul_flops_compute, module_flop_count) + + torch.add = wrapFunc(torch.add, add_flops_compute, module_flop_count) + torch.Tensor.add = wrapFunc(torch.Tensor.add, add_flops_compute, module_flop_count) + + torch.einsum = wrapFunc(torch.einsum, einsum_flops_compute, module_flop_count) + + torch.baddbmm = wrapFunc(torch.baddbmm, tensor_addmm_flops_compute, module_flop_count) + + +def _patch_miscellaneous_operations(module_flop_count): + einops.einsum = wrapFunc(einops.einsum, einops_einsum_flops_compute, module_flop_count) + + +def _reload_functionals(): + # torch.nn.functional does not support importlib.reload() + F.linear = old_functions[F.linear.__str__] + F.conv1d = old_functions[F.conv1d.__str__] + F.conv2d = old_functions[F.conv2d.__str__] + F.conv3d = old_functions[F.conv3d.__str__] + F.conv_transpose1d = old_functions[F.conv_transpose1d.__str__] + F.conv_transpose2d = old_functions[F.conv_transpose2d.__str__] + F.conv_transpose3d = old_functions[F.conv_transpose3d.__str__] + F.relu = old_functions[F.relu.__str__] + F.prelu = old_functions[F.prelu.__str__] + F.elu = old_functions[F.elu.__str__] + F.leaky_relu = old_functions[F.leaky_relu.__str__] + F.relu6 = old_functions[F.relu6.__str__] + if hasattr(F, "silu"): + F.silu = old_functions[F.silu.__str__] + F.gelu = old_functions[F.gelu.__str__] + F.batch_norm = old_functions[F.batch_norm.__str__] + F.layer_norm = old_functions[F.layer_norm.__str__] + F.instance_norm = old_functions[F.instance_norm.__str__] + F.group_norm = old_functions[F.group_norm.__str__] + F.avg_pool1d = 
old_functions[F.avg_pool1d.__str__] + F.avg_pool2d = old_functions[F.avg_pool2d.__str__] + F.avg_pool3d = old_functions[F.avg_pool3d.__str__] + F.max_pool1d = old_functions[F.max_pool1d.__str__] + F.max_pool2d = old_functions[F.max_pool2d.__str__] + F.max_pool3d = old_functions[F.max_pool3d.__str__] + F.adaptive_avg_pool1d = old_functions[F.adaptive_avg_pool1d.__str__] + F.adaptive_avg_pool2d = old_functions[F.adaptive_avg_pool2d.__str__] + F.adaptive_avg_pool3d = old_functions[F.adaptive_avg_pool3d.__str__] + F.adaptive_max_pool1d = old_functions[F.adaptive_max_pool1d.__str__] + F.adaptive_max_pool2d = old_functions[F.adaptive_max_pool2d.__str__] + F.adaptive_max_pool3d = old_functions[F.adaptive_max_pool3d.__str__] + F.upsample = old_functions[F.upsample.__str__] + F.interpolate = old_functions[F.interpolate.__str__] + F.softmax = old_functions[F.softmax.__str__] + F.embedding = old_functions[F.embedding.__str__] + + +def _reload_tensor_methods(): + torch.matmul = old_functions[torch.matmul.__str__] + torch.Tensor.matmul = old_functions[torch.Tensor.matmul.__str__] + torch.mm = old_functions[torch.mm.__str__] + torch.Tensor.mm = old_functions[torch.Tensor.mm.__str__] + torch.bmm = old_functions[torch.matmul.__str__] + torch.Tensor.bmm = old_functions[torch.Tensor.bmm.__str__] + torch.addmm = old_functions[torch.addmm.__str__] + torch.Tensor.addmm = old_functions[torch.Tensor.addmm.__str__] + torch.mul = old_functions[torch.mul.__str__] + torch.Tensor.mul = old_functions[torch.Tensor.mul.__str__] + torch.add = old_functions[torch.add.__str__] + torch.Tensor.add = old_functions[torch.Tensor.add.__str__] + + torch.einsum = old_functions[torch.einsum.__str__] + + torch.baddbmm = old_functions[torch.baddbmm.__str__] + + +def _reload_miscellaneous_operations(): + einops.einsum = old_functions[einops.einsum.__str__] + +# can not iterate over all submodules using self.model.modules() +# since modules() returns duplicate modules only once +def get_module_flops(module, prefix=""): + sum = module.__flops__ + log = "" + + if os.getenv("RANK","0") == "0": + log = f"| {prefix}{module.__class__} flops = {sum/1e12:.5f} T\n" + + + for child in module.children(): + flop,clog = get_module_flops(child, prefix=prefix+" ") + sum += flop + log += clog + + return sum, log diff --git a/Meissonic/InfinityStar/infinity/utils/mfu/mfu.py b/Meissonic/InfinityStar/infinity/utils/mfu/mfu.py new file mode 100644 index 0000000000000000000000000000000000000000..13c772657732ed24c2d1560f12d60aeeb463cb8f --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/mfu/mfu.py @@ -0,0 +1,163 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import time +import torch +import torch.distributed as dist +from contextlib import contextmanager, nullcontext +from functools import wraps +from .flops_profiler import FlopsProfiler +from .flops_calc_impl.custom_flops_impl import CUSTOM_HOOK_MAPPING, CUSTOM_NAME_MAPPING + +class _MFU: + def __init__(self, calibration_steps = 5, repeat_after_steps = -1): + """ + calibration_steps = -1 means always do calibration, has a very little overhead + repeat_after_steps = -1 means never repeat + """ + self.profs = [] + self.iter_time = None + self.is_during_calibration = False + self.calibration_steps = calibration_steps + self.repeat_after_steps = repeat_after_steps + self.steps = 0 + self.flops = [] + self.detail_flops = "" + self.ideal_TFLOPS = self._get_device_tflops() + self.ignore_list=[] + self.prof = FlopsProfiler() + + def append(self, model): + self.prof.append(model) 
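As an editorial aside (not part of the patch): unlike the `FlopsProfiler(model)` call shown in the class docstring, the constructor defined above takes no arguments; models are registered with `append()`, and `get_total_flops()` returns a `(total_flops, per_module_log)` tuple. A minimal standalone sketch follows, assuming the repository root is on `PYTHONPATH` (and `einops` is installed) so the import below resolves; note that the post-hook only accumulates FLOPs while gradients are enabled, and multiplies by 3 in training mode to approximate forward plus backward.

```python
import torch
import torch.nn as nn
from infinity.utils.mfu.flops_profiler import FlopsProfiler

model = nn.Sequential(nn.Linear(256, 512), nn.GELU(), nn.Linear(512, 256))
model.eval()                  # avoid the 3x forward+backward multiplier applied in training mode

prof = FlopsProfiler()
prof.append(model)
prof.start_profile()          # monkey-patches F.linear, torch.matmul, etc.

model(torch.randn(4, 256))    # keep grad enabled: the post-hook skips counting under no_grad

total, per_module_log = prof.get_total_flops()
prof.end_profile()            # restores the patched functions and removes the hooks

# Expect roughly 2 * 4 * (256*512 + 512*256) FLOPs from the two Linear layers,
# plus a small elementwise term for the GELU.
print(total)
print(per_module_log)
```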
+ + def step(self, iter_time): + self.steps += 1 + self.iter_time = iter_time + + if self.calibration_steps < 0 or self.steps <= self.calibration_steps: + self.is_during_calibration = True + flop = 0 + + try: + flop, log = self.prof.get_total_flops() + except Exception as e: + print(f"[WARN]: get_total_flops failed {e}") + + self.detail_flops = log + self.flops.append(flop) + self.reset() + + if self.steps == self.calibration_steps: + self.is_during_calibration = False + self.clear() + + if self.calibration_steps > 0 and self.repeat_after_steps > 0: + if self.steps >= self.calibration_steps + self.repeat_after_steps: + self.flops.clear() + self.steps = 0 + self.start() + + + def stop(self): + self.prof.stop_profile() + + def reset(self): + self.prof.reset_profile() + + def clear(self): + self.prof.end_profile() + + def start(self): + self.prof.start_profile(self.ignore_list) + + def get_flops_detail_info(self): + return self.detail_flops + + def get_mfu(self): + mfu = -1 + if self.iter_time is not None and len(self.flops) > 0: + avg_flop = sum(self.flops) / len(self.flops) + avg_Tflops = avg_flop / 1e12 + mfu = avg_Tflops / self.iter_time / self.ideal_TFLOPS + if not isinstance(mfu, float): + print(f"[WARN]: Something wrong with mfu calc, {type(mfu)=}.") + mfu = -1 + + return mfu + + def _get_device_tflops(self): + peak_tflops = -1 + arch = torch.cuda.get_device_capability() + if arch[0] == 8 and arch[1] == 0: # A100/A800 + peak_tflops = 312 # fp16 without sparsity + elif arch[0] == 9 and arch[1] == 0: # H100/H800 + peak_tflops = 989 # fp16 without sparsity + else: + print(f"unknown default tflops of device capability {arch[0]}.{arch[1]}") + return peak_tflops + + + +class mfutool: + _mfu = None + _last_time = None + _iter_time = None + + @staticmethod + def setup(calibration_steps = 5, repeat_after_steps = -1): + """ + calibration_steps = -1 means always do calibration, has a very little overhead + repeat_after_steps = -1 means never repeat + """ + if mfutool._mfu is None: + mfutool._mfu = _MFU(calibration_steps = calibration_steps, repeat_after_steps = repeat_after_steps) + + @staticmethod + def add(model): + if mfutool._mfu is None: + mfutool._mfu = _MFU() + mfutool._mfu.append(model) + + @staticmethod + def enable(): + if mfutool._mfu is not None: + mfutool._mfu.start() + + @staticmethod + def disable(): + if mfutool._mfu is not None: + mfutool._mfu.stop() + + @staticmethod + def step(): + if mfutool._mfu is not None: + if mfutool._last_time is not None: + mfutool._iter_time = time.time() - mfutool._last_time + mfutool._mfu.step(mfutool._iter_time) + mfutool._last_time = time.time() + + @staticmethod + def iter_time(): + return mfutool._iter_time + + @staticmethod + def get_mfu(): + if mfutool._mfu is not None: + return mfutool._mfu.get_mfu() + + @staticmethod + def get_flops_detail_info(): + if mfutool._mfu is not None: + return mfutool._mfu.get_flops_detail_info() + + @staticmethod + def register_custom(name, func): + if name not in CUSTOM_NAME_MAPPING: + print(f"[WARN] cannot find {name}, decorate your module class with @mfutool.custom_flops first") + return + CUSTOM_HOOK_MAPPING[CUSTOM_NAME_MAPPING[name]] = func + + @staticmethod + def custom_flops(cls, name): + CUSTOM_NAME_MAPPING[name] = cls + return cls + diff --git a/Meissonic/InfinityStar/infinity/utils/misc.py b/Meissonic/InfinityStar/infinity/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..6adeb12575270f506faeb755085c70f7c9b07f82 --- /dev/null +++ 
b/Meissonic/InfinityStar/infinity/utils/misc.py @@ -0,0 +1,322 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import datetime +import functools +import math +import os +import random +import subprocess +import sys +import threading +import time +from collections import defaultdict, deque +from typing import Iterator, List, Tuple + +import numpy as np +import pytz +import torch +import torch.distributed as tdist +import torch.nn.functional as F + +import infinity.utils.dist as dist + +os_system = functools.partial(subprocess.call, shell=True) +def echo(info): + os_system(f'echo "[$(date "+%m-%d-%H:%M:%S")] ({os.path.basename(sys._getframe().f_back.f_code.co_filename)}, line{sys._getframe().f_back.f_lineno})=> {info}"') +def os_system_get_stdout(cmd): + return subprocess.run(cmd, shell=True, stdout=subprocess.PIPE).stdout.decode('utf-8') +def os_system_get_stdout_stderr(cmd): + cnt = 0 + while True: + try: + sp = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=30) + except subprocess.TimeoutExpired: + cnt += 1 + print(f'[fetch free_port file] timeout cnt={cnt}') + else: + return sp.stdout.decode('utf-8'), sp.stderr.decode('utf-8') + + +def is_pow2n(x): + return x > 0 and (x & (x - 1) == 0) + + +def time_str(fmt='[%m-%d %H:%M:%S]'): + return datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime(fmt) + + +class DistLogger(object): + def __init__(self, lg): + self._lg = lg + + @staticmethod + def do_nothing(*args, **kwargs): + pass + + def __getattr__(self, attr: str): + return getattr(self._lg, attr) if self._lg is not None else DistLogger.do_nothing + +class TensorboardLogger(object): + def __init__(self, log_dir, filename_suffix): + try: import tensorflow_io as tfio + except: pass + from torch.utils.tensorboard import SummaryWriter + self.writer = SummaryWriter(log_dir=log_dir, filename_suffix=filename_suffix) + self.step = 0 + + def set_step(self, step=None): + if step is not None: + self.step = step + else: + self.step += 1 + + def loggable(self): + return self.step == 0 or (self.step + 1) % 500 == 0 + + def update(self, head='scalar', step=None, **kwargs): + if step is None: + step = self.step + if not self.loggable(): return + for k, v in kwargs.items(): + if v is None: continue + if hasattr(v, 'item'): v = v.item() + self.writer.add_scalar(f'{head}/{k}', v, step) + + def log_tensor_as_distri(self, tag, tensor1d, step=None): + if step is None: + step = self.step + if not self.loggable(): return + try: + self.writer.add_histogram(tag=tag, values=tensor1d, global_step=step) + except Exception as e: + print(f'[log_tensor_as_distri writer.add_histogram failed]: {e}') + + def log_image(self, tag, img_chw, step=None): + if step is None: + step = self.step + if not self.loggable(): return + self.writer.add_image(tag, img_chw, step, dataformats='CHW') + + def flush(self): + self.writer.flush() + + def close(self): + self.writer.close() + + +class TouchingDaemonDontForgetToStartMe(threading.Thread): + def __init__(self, files: List[str], sleep_secs: int, verbose=False): + super().__init__(daemon=True) + self.files = tuple(files) + self.sleep_secs = sleep_secs + self.is_finished = False + self.verbose = verbose + + f_back = sys._getframe().f_back + file_desc = f'{f_back.f_code.co_filename:24s}'[-24:] + self.print_prefix = f' ({file_desc}, line{f_back.f_lineno:-4d}) @daemon@ ' + + def finishing(self): + self.is_finished = True + + def run(self) -> None: + kw = {} + if tdist.is_initialized(): kw['clean'] = True + + stt = 
time.time() + if self.verbose: print(f'{time_str()}{self.print_prefix}[TouchingDaemon tid={threading.get_native_id()}] start touching {self.files} per {self.sleep_secs}s ...', **kw) + while not self.is_finished: + for f in self.files: + if os.path.exists(f): + try: + os.utime(f) + fp = open(f, 'a') + fp.close() + except: pass + time.sleep(self.sleep_secs) + + if self.verbose: print(f'{time_str()}{self.print_prefix}[TouchingDaemon tid={threading.get_native_id()}] finish touching after {time.time()-stt:.1f} secs {self.files} per {self.sleep_secs}s. ', **kw) + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=30, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! + """ + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + tdist.barrier() + tdist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + return np.median(self.deque) if len(self.deque) else 0 + + @property + def avg(self): + return sum(self.deque) / (len(self.deque) or 1) + + @property + def global_avg(self): + return self.total / (self.count or 1) + + @property + def max(self): + return max(self.deque) if len(self.deque) else 0 + + @property + def value(self): + return self.deque[-1] if len(self.deque) else 0 + + def time_preds(self, counts) -> Tuple[float, str, str]: + remain_secs = counts * self.median + return remain_secs, str(datetime.timedelta(seconds=round(remain_secs))), time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time() + remain_secs)) + + def __str__(self): + return self.fmt.format(median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value) + + +class MetricLogger(object): + def __init__(self): + self.meters = defaultdict(SmoothedValue) + self.iter_end_t = time.time() + self.log_iters = set() + self.log_every_iter = False + + def update(self, **kwargs): + # if it != 0 and it not in self.log_iters: return + for k, v in kwargs.items(): + if v is None: continue + if hasattr(v, 'item'): v = v.item() + # assert isinstance(v, (float, int)), type(v) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + if len(meter.deque): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return ' '.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, start_it, max_iters, itrt, log_freq, log_every_iter=False, header='', args=None): # also solve logging & skipping iterations before start_it + start_it = start_it % max_iters + self.log_iters = set(range(start_it, max_iters, log_freq)) + self.log_iters.add(start_it) + self.log_iters.add(max_iters-1) + self.log_iters.add(max_iters) + self.log_every_iter = 
log_every_iter + self.iter_end_t = time.time() + self.iter_time = SmoothedValue(fmt='{value:.4f}') + self.data_time = SmoothedValue(fmt='{value:.3f}') + header_fmt = header + ': [{0:' + str(len(str(max_iters))) + 'd}/{1}]' + + start_time = time.time() + if isinstance(itrt, Iterator) and not hasattr(itrt, 'preload') and not hasattr(itrt, 'set_epoch'): # this + for it in range(start_it, max_iters): + obj = next(itrt) + if it < start_it: continue + if args is not None and args.twoclip_alternatingtraining: # 2 clips alternating training + T = obj['raw_features_bcthw'][0].shape[1] + while (it % 2 == 0 and T > 21) or (it % 2 > 0 and T <= 21): + obj = next(itrt) + T = obj['raw_features_bcthw'][0].shape[1] + self.data_time.update(time.time() - self.iter_end_t) + yield it, obj + self.iter_time.update(time.time() - self.iter_end_t) + if self.log_every_iter or it in self.log_iters: + eta_seconds = self.iter_time.avg * (max_iters - it) + print(f'{header_fmt.format(it, max_iters)} eta: {str(datetime.timedelta(seconds=int(eta_seconds)))} {str(self)} T: {self.iter_time.value:.3f}s dataT: {self.data_time.value*1e3:.1f}ms', flush=True) + self.iter_end_t = time.time() + else: + if isinstance(itrt, int): itrt = range(itrt) + for it, obj in enumerate(itrt): + if it < start_it: + self.iter_end_t = time.time() + continue + self.data_time.update(time.time() - self.iter_end_t) + yield it, obj + self.iter_time.update(time.time() - self.iter_end_t) + if self.log_every_iter or it in self.log_iters: + eta_seconds = self.iter_time.avg * (max_iters - it) + print(f'{header_fmt.format(it, max_iters)} eta: {str(datetime.timedelta(seconds=int(eta_seconds)))} {str(self)} T: {self.iter_time.value:.3f}s dataT: {self.data_time.value*1e3:.1f}ms', flush=True) + self.iter_end_t = time.time() + cost = time.time() - start_time + cost_str = str(datetime.timedelta(seconds=int(cost))) + print(f'{header} Cost of this ep: {cost_str} ({cost / (max_iters-start_it):.3f} s / it)', flush=True) + + +class NullDDP(torch.nn.Module): + def __init__(self, module, *args, **kwargs): + super(NullDDP, self).__init__() + self.module = module + self.require_backward_grad_sync = False + + def forward(self, *args, **kwargs): + return self.module(*args, **kwargs) + + +def build_2d_sincos_position_embedding(h, w, embed_dim, temperature=10000., sc=0, verbose=True): # (1, hw**2, embed_dim) + # DiT: sc=0 + # DETR: sc=2? + grid_w = torch.arange(w, dtype=torch.float32) + grid_h = torch.arange(h, dtype=torch.float32) + grid_w, grid_h = torch.meshgrid([grid_w, grid_h], indexing='ij') + if sc == 0: + scale = 1 + elif sc == 1: + scale = math.pi * 2 / w + else: + scale = 1 / w + grid_w = scale * grid_w.reshape(h*w, 1) # scale * [0, 0, 0, 1, 1, 1, 2, 2, 2] + grid_h = scale * grid_h.reshape(h*w, 1) # scale * [0, 1, 2, 0, 1, 2, 0, 1, 2] + + assert embed_dim % 4 == 0, f'Embed dimension ({embed_dim}) must be divisible by 4 for 2D sin-cos position embedding!' 
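    # Editorial note, not part of the original patch: the remaining lines implement the
    # standard 2D sin-cos encoding. Each axis gets pos_dim = embed_dim // 4 frequencies
    # omega_i = temperature ** (-i / pos_dim); the embedding concatenates
    # [sin(w * omega), cos(w * omega), sin(h * omega), cos(h * omega)] along the channel
    # dimension, yielding a (1, h*w, embed_dim) tensor that depends only on h, w and embed_dim.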
+ pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = (-math.log(temperature) * omega).exp() + # omega == (1/T) ** (arange(pos_dim) / pos_dim), a vector only dependent on C + out_w = grid_w * omega.view(1, pos_dim) # out_w: scale * [0*ome, 0*ome, 0*ome, 1*ome, 1*ome, 1*ome, 2*ome, 2*ome, 2*ome] + out_h = grid_h * omega.view(1, pos_dim) # out_h: scale * [0*ome, 1*ome, 2*ome, 0*ome, 1*ome, 2*ome, 0*ome, 1*ome, 2*ome] + pos_emb = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None, :, :] + if verbose: print(f'[build_2d_sincos_position_embedding @ {hw} x {hw}] scale_type={sc}, temperature={temperature:g}, shape={pos_emb.shape}') + return pos_emb # (1, hw**2, embed_dim) + + +if __name__ == '__main__': + pass diff --git a/Meissonic/InfinityStar/infinity/utils/profile/__init__.py b/Meissonic/InfinityStar/infinity/utils/profile/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..339c7fc9748c3c657770e2aa41c4d882b2319002 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/profile/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from .torch_profiler import torch_profiler +from .py_profiler import py_profiler diff --git a/Meissonic/InfinityStar/infinity/utils/profile/py_profiler.py b/Meissonic/InfinityStar/infinity/utils/profile/py_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..49c5b6f9a5e72a4ffce718a392a2375076c24372 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/profile/py_profiler.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import logging +import pstats +import cProfile +import contextlib + +def _colored(st, color, background=False): + return f"\u001b[{10*background+60*(color.upper() == color)+30+['black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white'].index(color.lower())}m{st}\u001b[0m" if color is not None else st + +def _format_fcn(fcn): + return f"{fcn[0]}:{fcn[1]}:{fcn[2]}" + +class py_profiler(contextlib.ContextDecorator): + def __init__(self, enabled=True, sort='cumtime', fn=None, ts=1): + self.enabled, self.sort, self.fn, self.time_scale = enabled, sort, fn, 1e3/ts + def __enter__(self): + self.pr = cProfile.Profile() + if self.enabled: + self.pr.enable() + def __exit__(self, *exc): + if self.enabled: + self.pr.disable() + if self.fn: + self.pr.dump_stats(self.fn) + stats = pstats.Stats(self.pr).strip_dirs().sort_stats(self.sort) + for fcn in stats.fcn_list[0:int(len(stats.fcn_list))]: + (_primitive_calls, num_calls, tottime, cumtime, callers) = stats.stats[fcn] + scallers = sorted(callers.items(), key=lambda x: -x[1][2]) + print(f"n:{num_calls:8d} tm:{tottime*self.time_scale:7.2f}ms tot:{cumtime*self.time_scale:7.2f}ms", _colored(_format_fcn(fcn).ljust(50), "yellow")) + + +if __name__ == "__main__": + def fn(): + s = 0 + for i in range(10000000): + s += i + return s + + with py_profiler(): + fn() diff --git a/Meissonic/InfinityStar/infinity/utils/profile/torch_profiler.py b/Meissonic/InfinityStar/infinity/utils/profile/torch_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..6321f024c82572200ef80a3486dab03c65a73a63 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/profile/torch_profiler.py @@ -0,0 +1,100 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import logging +import os +from contextlib import contextmanager, nullcontext +from datetime 
import datetime + +import torch +import torch.distributed as dist +from torch.profiler import record_function as torch_record_function + + +class _TraceHandler: + def __init__(self, save_path="/tmp/trace.json", logger=None, rank=None): + self.logger = logger + if logger is None: + self.logger = logging.getLogger(__name__) + + self.logger.info(f"trace dump path: {save_path}") + self.save_path = save_path + ".json.gz" + self.rank = rank + + def __call__(self, prof): + if self.logger is not None: + self.logger.info(f"dump trace to {self.save_path}") + prof.export_chrome_trace(self.save_path) + +class torch_profiler: + """ + usage: + + ```python + import pnp + + pnp.torch_profiler.setup(output_folder="./", wait_steps=30) + + for step in range(100): + pnp.torch_profiler.step() + ... + + with pnp.troch_profiler.mark("fwd"): + model.forward() + + ... + + with pnp.torch_profiler.mark("bwd"): + loss.backward() + + ``` + + """ + _TP = None + mark = nullcontext + + @staticmethod + def step(): + if torch_profiler._TP is None: + return + + torch_profiler._TP.step() + + @staticmethod + @property + def mark(): + return torch_profiler.mark + + @staticmethod + def setup(enabled=True, output_folder="./", file_prefix="", wait_steps=30): + """ + enabled: if False, profiler will do nothing + output_folder: the folder to dump trace + wait_steps: start profiling after wait_steps(in your training loop) + file_prefix: the prefix of the trace file for your custom + """ + if enabled: + if not os.path.exists(output_folder): + os.makedirs(output_folder, exist_ok=True) + + torch_profiler._TP = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + schedule=torch.profiler.schedule( + wait=wait_steps, + warmup=3, + active=5, + repeat=0, + ), + with_stack=True, + record_shapes=True, + profile_memory=False, + on_trace_ready=_TraceHandler( + f"{output_folder}/{file_prefix}world_size-{dist.get_world_size()}-rank{dist.get_rank()}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}", + None, + dist.get_rank(), + ), + ) + torch_profiler._TP.start() + torch_profiler.mark = torch_record_function diff --git a/Meissonic/InfinityStar/infinity/utils/save_and_load.py b/Meissonic/InfinityStar/infinity/utils/save_and_load.py new file mode 100644 index 0000000000000000000000000000000000000000..a95d5480083efc50a835f89518e6a3627ce04a8b --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/save_and_load.py @@ -0,0 +1,247 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import gc +import os +import os.path as osp +import subprocess +import time +import re +from typing import List, Optional, Tuple + +import torch +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + +import glob +import shutil +from infinity.utils import arg_util +import infinity.utils.dist as dist + + +def glob_with_epoch_iter(pattern, recursive=False): + def extract_ep_iter(filename): + match = re.search(r'ep(\d+)-iter(\d+)', filename) + if match: + ep = int(match.group(1)) + iter_idx = int(match.group(2)) + return ep, iter_idx + return 0, 0 + return sorted(glob.glob(pattern, recursive=recursive), key=lambda x: extract_ep_iter(os.path.basename(x)), reverse=True) + + +def glob_with_global_step(pattern, recursive=False): + def extract_ep_iter(filename): + match = re.search(r'global_step_(\d+)', filename) + if match: + iter_idx = int(match.group(1)) + return iter_idx + return 0 + return sorted(glob.glob(pattern, recursive=recursive), key=lambda x: 
extract_ep_iter(os.path.basename(x)), reverse=True) + + +class CKPTSaver(object): + def __init__(self, is_master: bool, eval_milestone: List[Tuple[float, float]]): + self.is_master = is_master + self.time_stamp = torch.tensor([time.time() - 1e5, time.time()], device=dist.get_device()) + self.sp_also: subprocess.Popen = None + self.sp_best: subprocess.Popen = None + self.sp_backup: subprocess.Popen = None + self.acc_str, self.eval_milestone = '[no acc str]', eval_milestone + + def sav( + self, args: arg_util.Args, g_it: int, next_ep: int, next_it: int, trainer, + acc_str: Optional[str] = None, eval_milestone: Optional[List[Tuple[float, float]]] = None, + also_save_to: str = None, best_save_to: str = None, + ): + fname = f'global_step_{g_it}.pth' + local_out_ckpt = os.path.join(args.bed, fname) + trainer_state = trainer.state_dict() + stt = time.time() + if self.is_master: + torch.save({ + 'args': args.state_dict(), + 'arch': args.model, + 'epoch': next_ep, + 'iter': next_it, + 'trainer': trainer_state, + 'acc_str': self.acc_str, + 'g_it': g_it, + }, local_out_ckpt) + cost = time.time() - stt + print(f'Checkpoint save cost: {cost:.2f}s', flush=True) + print(f'Checkpoint save to: {local_out_ckpt}', flush=True) + + del trainer_state + gc.collect(), + torch.cuda.empty_cache() + dist.barrier() + + +def auto_resume(args: arg_util.Args, pattern='*.pth') -> Tuple[List[str], int, int, str, List[Tuple[float, float]], dict, dict]: + info = [] + resume = '' + if args.auto_resume: + all_ckpt = glob_with_global_step(os.path.join(args.bed, pattern)) + if len(all_ckpt) == 0: + info.append(f'[auto_resume] no ckpt found @ {pattern}') + info.append(f'[auto_resume quit]') + else: + resume = all_ckpt[0] + info.append(f'[auto_resume] auto load from @ {resume} ...') + else: + info.append(f'[auto_resume] disabled') + info.append(f'[auto_resume quit]') + + if len(resume) == 0: + return info, 0, 0, '[no acc str]', [], {}, {} + + print(f'auto resume from {resume}') + ckpt = torch.load(resume, map_location='cpu') + + dist.barrier() + ep, it, g_it = ckpt['epoch'], ckpt['iter'], ckpt['g_it'] + eval_milestone = ckpt.get('milestones', []) + info.append(f'[auto_resume success] resume from ep{ep}, it{it}, eval_milestone: {eval_milestone}') + return info, ep, g_it, ckpt.get('acc_str', '[no acc str]'), eval_milestone, ckpt['trainer'], ckpt['args'] + +def omnistore_auto_resume(args: arg_util.Args, pattern='ckpt*.pth'): + info = [] + resume = '' + if args.auto_resume: + for dd in (args.local_out_path, args.bed): + all_ckpt = glob_with_global_step(os.path.join(dd, pattern)) + if len(all_ckpt): break + if len(all_ckpt) == 0: + info.append(f'[auto_resume] no ckpt found @ {pattern}') + info.append(f'[auto_resume quit]') + else: + resume = all_ckpt[0] + info.append(f'[auto_resume] auto load from @ {resume} ...') + else: + info.append(f'[auto_resume] disabled') + info.append(f'[auto_resume quit]') + + return resume, info + + +class omnistoreCheckpoint(object): + def __init__(self, eval_milestone: List[Tuple[float, float]]): + self.time_stamp = torch.tensor([time.time() - 1e5, time.time()], device=dist.get_device()) + self.sp_also: subprocess.Popen = None + self.sp_best: subprocess.Popen = None + self.sp_backup: subprocess.Popen = None + self.acc_str, self.eval_milestone = '[no acc str]', eval_milestone + + def sav( + self, args: arg_util.Args, global_it: int, next_ep: int, next_it: int, fsdp_object: FSDP, optimizer_object: torch.optim.Optimizer, + acc_str: Optional[str] = None, eval_milestone: Optional[List[Tuple[float, float]]] = 
None, + ): + if acc_str is not None: self.acc_str = acc_str + if eval_milestone is not None: self.eval_milestone = eval_milestone + + stt = time.time() + + checkpoint_state = { + # 'model': { + # 'main_model': fsdp_object, + # 'ema_model': ema_fsdp_object, + # }, + 'model': fsdp_object, + # 'optimizer': optimizer_object, + 'extra_state': {} + } + + from omnistore import FSDPCheckpointer + print(f"{FSDPCheckpointer=}") + + FSDPCheckpointer.save( + path=args.bed, + checkpoint_state=checkpoint_state, + global_steps=global_it, + async_fast_checkpoint=True, + save_flatten_model_optimizer=True, + ) + if dist.is_master(): + torch.save({ + 'args': args.state_dict(), + 'next_ep': next_ep, + 'next_it': next_it, + 'global_it': global_it, + 'acc_str': self.acc_str, + 'milestones': self.eval_milestone, + }, os.path.join(args.bed, 'meta.pth')) + + if self.sp_backup is not None: + self.sp_backup.wait(timeout=300); self.sp_backup.kill(); self.sp_backup.communicate() + self.time_stamp[0] = time.time() + def auto_sync(source_filename, target_filename): + cmd = f'cp -r {source_filename} {target_filename}' + self.sp_backup = subprocess.Popen(cmd, shell=True, bufsize=-1) + print(f'[Saver] auto_save cmd: {cmd}', flush=True) + local_files = glob.glob(f"{args.local_out_path}/*.txt") + for filename in local_files: + basename = os.path.basename(filename) + target_filename = f'{args.bed}/{basename}' + auto_sync(filename, target_filename) + cost = time.time() - stt + print(f'[CKPTSaver][rank00][omnistore: {FSDPCheckpointer is not None}] cost={time.time()-stt:.2f}s, ckpt saved to global_step_{global_it}', flush=True) + + dist.barrier() + del checkpoint_state + + def load(self, ckpt_path, fsdp_object, optimizer_object): + from omnistore import FSDPCheckpointer + checkpoint_state = { + 'model': fsdp_object, + # 'optimizer': optimizer_object, + 'extra_state': {} + } + FSDPCheckpointer.load( + ckpt_path, + checkpoint_state, + load_flatten_model_optimizer=True, + ) + global_it = -1 + meta_path = os.path.join(os.path.dirname(ckpt_path), 'meta.pth') + if os.path.exists(meta_path): + train_meta = torch.load(meta_path) + args_state, next_ep, next_it, acc_str, milestones = train_meta['args'], train_meta['next_ep'], train_meta['next_it'], train_meta['acc_str'], train_meta['milestones'] + global_it = train_meta.get('global_it', -1) + else: + args_state, next_ep, next_it, acc_str, milestones = {}, 0, 0, '', [] + return args_state, next_ep, next_it, global_it, acc_str, milestones + +def merge_ckpt(omnistore_ckpt_path, output_path, fsdp_save_flatten_model, save=False): + print(f'merging omnistore ckpt into torch-format ckpt') + start = time.time() + from omnistore.utilities.ckpt_format_tool import omnistore_ckpt_to_pytorch_ckpt + state_dict = omnistore_ckpt_to_pytorch_ckpt( + save_path=omnistore_ckpt_path, + output_path=output_path, + framework="fsdp", + model_only=True, + return_dict=True, + fsdp_save_flatten_model=fsdp_save_flatten_model, + ) + print(f"ckpt merged in {time.time() - start:.2f} seconds") + state_dict_model = state_dict["model"] + if '.cfg_uncond' in state_dict_model: + state_dict_model['cfg_uncond'] = state_dict_model['.cfg_uncond'] + del state_dict_model['.cfg_uncond'] + if '.pos_start' in state_dict_model: + state_dict_model['pos_start'] = state_dict_model['.pos_start'] + del state_dict_model['.pos_start'] + if '.sos_token' in state_dict_model: + state_dict_model['sos_token'] = state_dict_model['.sos_token'] + del state_dict_model['.sos_token'] + if 'semantic_head.weight' in state_dict_model: + 
print(f'[rush_resume] replace semantic_head with semantic_head2') + state_dict_model['semantic_head2.weight'] = state_dict_model['semantic_head.weight'] + state_dict_model['semantic_head2.bias'] = state_dict_model['semantic_head.bias'] + del state_dict_model['semantic_head.weight'] + del state_dict_model['semantic_head.bias'] + if save: + save_file = os.path.join(output_path, "slim-model.pt") + print(f'save to {save_file}') + torch.save(state_dict_model, save_file) + return state_dict_model diff --git a/Meissonic/InfinityStar/infinity/utils/sequence_parallel.py b/Meissonic/InfinityStar/infinity/utils/sequence_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..ceb7e1865bf3fc8042554cca799f17da5fd92dda --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/sequence_parallel.py @@ -0,0 +1,98 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import torch +import torch.nn as nn +import torch.distributed as dist +from .comm.pg_utils import ProcessGroupManager +from .comm.comm import set_sp_comm_group, split_sequence, gather_sequence, all_to_all_comm +from .comm.operation import gather_forward_split_backward + +class SequenceParallelManager: + _SP_GROUP = None + _SP_SIZE = 0 + + @staticmethod + def sp_on(): + return SequenceParallelManager._SP_GROUP is not None + + @staticmethod + def init_sp(sp_size): + if SequenceParallelManager._SP_GROUP is not None: + print("WARN: sequence parallel group is already initialized") + return + + if sp_size <= 1: + print(f"WARN: sequence parallel size must > 1 but got {sp_size}") + return + + world_size = dist.get_world_size() + assert world_size % sp_size == 0, f"world_size {world_size} must be divisible by sp_size({sp_size})" + SequenceParallelManager._SP_SIZE = sp_size + + pm = ProcessGroupManager( + world_size // sp_size, + sp_size, + dp_axis=0, + sp_axis=1, + ) + pm_group = pm.sp_group + set_sp_comm_group(pm_group) + SequenceParallelManager._SP_GROUP = pm_group + return + + @staticmethod + def get_sp_group(): + return SequenceParallelManager._SP_GROUP + + @staticmethod + def get_sp_size(): + return SequenceParallelManager._SP_SIZE + + @staticmethod + def get_sp_group_nums(): + # if 2 sp_size, 8 ranks, group nums is 4 + if SequenceParallelManager.sp_on(): + world_size = torch.distributed.get_world_size() + return world_size // SequenceParallelManager._SP_SIZE + else: + return 0 + + @staticmethod + def get_sp_rank(): + if SequenceParallelManager.sp_on(): + global_rank = torch.distributed.get_rank() + sp_rank = global_rank % SequenceParallelManager._SP_SIZE + return sp_rank + else: + return 0 + + def get_sp_group_rank(): + if SequenceParallelManager.sp_on(): + global_rank = torch.distributed.get_rank() + sp_group_rank = global_rank // SequenceParallelManager._SP_SIZE + return sp_group_rank + else: + return 0 + +def sp_split_sequence_by_dim(seq, seqlen_dim=1) -> torch.Tensor: + """ + split the raw sequence by seqlen_dim + """ + return split_sequence(seq, SequenceParallelManager.get_sp_group(), seqlen_dim, 'down') + +def sp_gather_sequence_by_dim(seq, seqlen_dim=1) -> torch.Tensor: + """ + gather seqlen_dim to recover raw sequence + """ + return gather_sequence(seq, SequenceParallelManager.get_sp_group(), seqlen_dim, 'up') + +def sp_all_to_all(ts, scatter_dim, gather_dim): + """ + reorder the tensor's dimension, like [raw_seq_len/sp_size, hidden_dim] to [raw_seq_len, hidden_dim/sp_size] + + scatter_dim: the dimension to split the tensor + gather_dim: the dimension to concatenate + """ + + return 
all_to_all_comm(ts, SequenceParallelManager.get_sp_group(), scatter_dim, gather_dim) + diff --git a/Meissonic/InfinityStar/infinity/utils/video_decoder.py b/Meissonic/InfinityStar/infinity/utils/video_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..1d93dffa97b9249f2ffaf96139a143877b0a13a4 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/video_decoder.py @@ -0,0 +1,290 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +from abc import ABC, abstractmethod +import io +import math +import numpy as np +from typing import Optional, TypeVar, Union +import collections + +try: + import decord +except ImportError: + _HAS_DECORD = False +else: + _HAS_DECORD = True + +if _HAS_DECORD: + decord.bridge.set_bridge('native') + +DecordDevice = TypeVar("DecordDevice") + +# https://github.com/dmlc/decord/issues/208#issuecomment-1157632702 +class VideoReaderWrapper(decord.VideoReader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.seek(0) + + def __getitem__(self, key): + frames = super().__getitem__(key) + self.seek(0) + return frames + + +class Video(ABC): + """ + Video provides an interface to access clips from a video container. + """ + + @abstractmethod + def __init__( + self, + file: Union[str, io.IOBase], + video_name: Optional[str] = None, + decode_audio: bool = True, + ) -> None: + """ + Args: + file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that + contains the encoded video. + """ + pass + + @property + @abstractmethod + def duration(self) -> float: + """ + Returns: + duration of the video in seconds + """ + pass + + @abstractmethod + def get_clip( + self, start_sec: float, end_sec: float, num_samples: int + ): + """ + Retrieves frames from the internal video at the specified start and end times + in seconds (the video always starts at 0 seconds). + + Args: + start_sec (float): the clip start time in seconds + end_sec (float): the clip end time in seconds + Returns: + video_data_dictonary: A dictionary mapping strings to tensor of the clip's + underlying data. + + """ + pass + + def close(self): + pass + + +class EncodedVideoDecord(Video): + """ + + Accessing clips from an encoded video using Decord video reading API + as the decoding backend. For more details, please refer to - + `Decord ` + """ + + def __init__( + self, + file: Union[str, io.IOBase], + video_name: Optional[str] = None, + width: int = -1, + height: int = -1, + num_threads: int = 0, + fault_tol: int = -1, + ) -> None: + """ + Args: + file str: file path. + video_name (str): An optional name assigned to the video. + decode_audio (bool): If disabled, audio is not decoded. + sample_rate: int, default is -1 + Desired output sample rate of the audio, unchanged if `-1` is specified. + mono: bool, default is True + Desired output channel layout of the audio. `True` is mono layout. `False` + is unchanged. + width : int, default is -1 + Desired output width of the video, unchanged if `-1` is specified. + height : int, default is -1 + Desired output height of the video, unchanged if `-1` is specified. + num_threads : int, default is 0 + Number of decoding thread, auto if `0` is specified. + fault_tol : int, default is -1 + The threshold of corupted and recovered frames. This is to prevent silent fault + tolerance when for example 50% frames of a video cannot be decoded and duplicate + frames are returned. You may find the fault tolerant feature sweet in many + cases, but not for training models. 
Say `N = # recovered frames` + If `fault_tol` < 0, nothing will happen. + If 0 < `fault_tol` < 1.0, if N > `fault_tol * len(video)`, + raise `DECORDLimitReachedError`. + If 1 < `fault_tol`, if N > `fault_tol`, raise `DECORDLimitReachedError`. + """ + self._video_name = video_name + if not _HAS_DECORD: + raise ImportError( + "decord is required to use EncodedVideoDecord decoder. Please " + "install with 'pip install decord' for CPU-only version and refer to" + "'https://github.com/dmlc/decord' for GPU-supported version" + ) + try: + self._av_reader = VideoReaderWrapper( + uri=file, + ctx=decord.cpu(0), + width=width, + height=height, + num_threads=num_threads, + fault_tol=fault_tol, + ) + except Exception as e: + raise RuntimeError(f"Failed to open video {video_name} with Decord. {e}") + + self._fps = self._av_reader.get_avg_fps() + self._duration = float(len(self._av_reader)) / float(self._fps) + + @property + def name(self) -> Optional[str]: + """ + Returns: + name: the name of the stored video if set. + """ + return self._video_name + + @property + def duration(self) -> float: + """ + Returns: + duration: the video's duration/end-time in seconds. + """ + return self._duration + + def close(self): + if self._av_reader is not None: + del self._av_reader + self._av_reader = None + + def get_clip( + self, start_sec: float, end_sec: float, num_samples: int + ): + """ + Retrieves frames from the encoded video at the specified start and end times + in seconds (the video always starts at 0 seconds). + + Args: + start_sec (float): the clip start time in seconds + end_sec (float): the clip end time in seconds + Returns: + clip_data: + A dictionary mapping the entries at "video" and "audio" to a tensors. + + "video": A tensor of the clip's RGB frames with shape: + (channel, time, height, width). The frames are of type torch.float32 and + in the range [0 - 255]. + + Returns None if no video or audio found within time range. + + """ + if start_sec > end_sec or start_sec > self._duration: + raise RuntimeError( + f"Incorrect time window for Decord decoding for video: {self._video_name}." + ) + + start_idx = math.ceil(self._fps * start_sec) + end_idx = math.ceil(self._fps * end_sec) + end_idx = min(end_idx, len(self._av_reader)) + # frame_idxs = list(range(start_idx, end_idx)) + + frame_idxs = np.linspace(start_idx, end_idx - 1, num_samples, dtype=int) + + try: + outputs = self._av_reader.get_batch(frame_idxs) + return outputs.asnumpy(), frame_idxs - frame_idxs[0] + except Exception as e: + print(f"Failed to decode video with Decord: {self._video_name}. {e}") + raise e + +try: + import cv2 +except ImportError: + print(f"ERR: import cv2 failed, install cv2 by 'pip install opencv-python'") + +class EncodedVideoOpencv(): + def __init__( + self, + file: Union[str, io.IOBase], + video_name: Optional[str] = None, + width: int = -1, + height: int = -1, + num_threads: int = 0, + fault_tol: int = -1, + ) -> None: + """ + Args: + file str: file path. + video_name (str): An optional name assigned to the video. + width : Not support yet. + height : Not support yet. + num_threads : Not support yet. + fault_tol : Not support yet. + """ + + self._video_name = video_name + self.cap = cv2.VideoCapture(file) + self._fps = self.cap.get(cv2.CAP_PROP_FPS) + self._vlen = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + self._duration = float(self._vlen) / float(self._fps) + + @property + def name(self) -> Optional[str]: + """ + Returns: + name: the name of the stored video if set. 
+ """ + return self._video_name + + @property + def duration(self) -> float: + """ + Returns: + duration: the video's duration/end-time in seconds. + """ + return self._duration + + def __del__(self): + self.close() + + def close(self): + self.cap.release() + + def get_clip( + self, start_sec: float, end_sec: float, num_samples: int + ): + if start_sec > end_sec or start_sec > self._duration: + raise RuntimeError( + f"Incorrect time window for Decord decoding for video: {self._video_name}." + ) + start_idx = math.ceil(self._fps * start_sec) + end_idx = math.ceil(self._fps * end_sec) + end_idx = min(end_idx, self._vlen) + frame_idxs = np.linspace(start_idx, end_idx - 1, num_samples, dtype=int) + frame_idx2freq = collections.defaultdict(int) + for frame_idx in frame_idxs: + frame_idx2freq[frame_idx] += 1 + try: + frames = [] + for i in range(self._vlen): + if i > frame_idxs[-1]: + break + ret, frame = self.cap.read() + if i in frame_idx2freq: + frames.extend([frame] * frame_idx2freq[i]) + frames = np.array(frames).astype(np.uint8) # BGR type + assert len(frames) == num_samples + return frames, frame_idxs - frame_idxs[0] + except Exception as e: + print(f"Failed to decode video with opencv: {self._video_name}. {e}") + raise e diff --git a/Meissonic/InfinityStar/infinity/utils/wandb_utils.py b/Meissonic/InfinityStar/infinity/utils/wandb_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e28640fb1ae3b7ed2c6f2f3f6134e60b39786a46 --- /dev/null +++ b/Meissonic/InfinityStar/infinity/utils/wandb_utils.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import wandb +import torch +from torchvision.utils import make_grid +import torch.distributed as dist +from PIL import Image +import os +import argparse +import hashlib +import math + + +def is_main_process(): + return dist.get_rank() == 0 + +def namespace_to_dict(namespace): + return { + k: namespace_to_dict(v) if isinstance(v, argparse.Namespace) else v + for k, v in vars(namespace).items() + } + + +def generate_run_id(exp_name): + # https://stackoverflow.com/questions/16008670/how-to-hash-a-string-into-8-digits + return str(int(hashlib.sha256(exp_name.encode('utf-8')).hexdigest(), 16) % 10 ** 8) + + +def initialize(args, entity, exp_name, project_name): + config_dict = namespace_to_dict(args) + wandb.login(key=os.environ["WANDB_KEY"]) + wandb.init( + entity=entity, + project=project_name, + name=exp_name, + config=config_dict, + id=generate_run_id(exp_name), + resume="allow", + ) + + +def log(stats, step=None): + if is_main_process(): + wandb.log({k: v for k, v in stats.items()}, step=step) + + +def log_image(name, sample, step=None): + if is_main_process(): + sample = array2grid(sample) + wandb.log({f"{name}": wandb.Image(sample), "train_step": step}) + + +def array2grid(x): + nrow = round(math.sqrt(x.size(0))) + x = make_grid(x, nrow=nrow, normalize=True, value_range=(-1,1)) + x = x.mul(255).add_(0.5).clamp_(0,255).permute(1,2,0).to('cpu', torch.uint8).numpy() + return x \ No newline at end of file diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_0.png b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_0.png new file mode 100644 index 0000000000000000000000000000000000000000..4f2998747a8536b507c83ff627ef94604969a8ed --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fd9333e848ec20104afc2bb061a2b439a957ce797fa26d96ab209c4334d371c6 +size 5556935 diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_1.png b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_1.png new file mode 100644 index 0000000000000000000000000000000000000000..dba85965cb0743fd58dc630633e75a37ca499394 --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97ed884be78b1fa427e15f853164b22111986738a519d23bcc6dc7143027ddb8 +size 7383072 diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_2.png b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_2.png new file mode 100644 index 0000000000000000000000000000000000000000..aec15670b1d67b6b3fcfc36082341309b9d77877 --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:489f270098d85a55e249c4ea70170cbd443af5bce72a0d425c3076b69b8873a8 +size 8076436 diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_3.png b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_3.png new file mode 100644 index 0000000000000000000000000000000000000000..08a9e7b41599de10590e351742a23928e3ae9793 --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2f35512405c89454cb2de242824e3ba64471a2490ba9b5534df481de2ac1379 +size 8018172 diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_0.mp4 b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..773343e61230b780ba26befeb5e6b6ca0ef1a383 --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f5d2f46abd3d9be9317a8d9637c2e53ecc921a9bf2e0e00c57ac407a5c43fb8 +size 310201 diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_1.mp4 b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d67f3c9b7eadded0f14943389dad69c50bd38aaf --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77b1b29e450e09d5a03583f65847d0139d999e05bc0258f898712c2142ea0ed9 +size 846593 diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_2.mp4 b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..512bb5362f191c5f5883d4f83ed35abcb1525f55 --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_2.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bae2987646554cb82e471c7789e1bca21727418e5d54e59d111b6eef0846a763 +size 1260796 diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_3.mp4 b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_3.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6347f75d15f0f4a9a643bb096f3a19f517239d66 --- /dev/null +++ 
b/Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_3.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23b576548491fbaf503f96dfbdefb83894c26e9674642c33653f6a6617667c91 +size 779363 diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_0.txt b/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9740182d2e06a6159802e0dd8efe3229aefb8f5 --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_0.txt @@ -0,0 +1,12 @@ +Video Index: 0 +Video Path: 000/000/000/0.mp4 +Caption: In the video, a man is seen in a living room setting, standing in front of a window with blinds. He is wearing a black sweater and appears to be in the middle of a conversation. The room is dimly lit, with a lamp providing a soft glow in the background. The man's expression is serious, suggesting that the conversation is of importance. The overall style of the video is realistic and naturalistic, capturing a candid moment in the man's life. + +=== Metrics === +Average PSNR: 23.08 dB +Average MSE: 0.009946 +Average SSIM: 0.9354 + +Per-frame PSNR: [18.016685485839844, 12.163713455200195, 16.593429565429688, 24.22454833984375, 23.36005973815918, 24.178247451782227, 25.260787963867188, 23.853004455566406, 26.29904556274414, 25.639766693115234, 27.015087127685547, 25.42237091064453, 27.976186752319336] +Per-frame MSE: [0.015788154676556587, 0.060761529952287674, 0.021910738199949265, 0.0037804641760885715, 0.004613112658262253, 0.003820983460173011, 0.002977976808324456, 0.0041181244887411594, 0.002344744745641947, 0.0027291239239275455, 0.0019883429631590843, 0.002869214164093137, 0.001593607128597796] +Per-frame SSIM: [0.9022817611694336, 0.6287797689437866, 0.8499270677566528, 0.9727839231491089, 0.9670426845550537, 0.9726088047027588, 0.9786661267280579, 0.9705071449279785, 0.9831254482269287, 0.9803852438926697, 0.9857344031333923, 0.9793725609779358, 0.9885582327842712] diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_1.txt b/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..0771ff99f96f7e520462956fbff48f308d61cb2c --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_1.txt @@ -0,0 +1,12 @@ +Video Index: 1 +Video Path: 000/000/001/1.mp4 +Caption: The video shows a man standing next to a purple van with a floral design on the side. The man is wearing a black t-shirt and jeans, and he is smiling and waving his hands in the air. The van has pink rims and a black roof rack. The van is parked in front of a building with a glass door. The man appears to be happy and excited about the van. The video is likely a short clip of a man showing off his van. 
+ +=== Metrics === +Average PSNR: 14.09 dB +Average MSE: 0.039555 +Average SSIM: 0.6330 + +Per-frame PSNR: [16.28582763671875, 14.689748764038086, 14.398616790771484, 13.97947883605957, 13.90301513671875, 14.176526069641113, 14.042573928833008, 13.736175537109375, 13.47714900970459, 13.593497276306152, 13.252386093139648, 13.314827919006348, 14.291234016418457] +Per-frame MSE: [0.023518914356827736, 0.03396449238061905, 0.03631936386227608, 0.03999926522374153, 0.040709760040044785, 0.038224995136260986, 0.03942235931754112, 0.04230409488081932, 0.044904015958309174, 0.04371698945760727, 0.04728913679718971, 0.04661409184336662, 0.037228599190711975] +Per-frame SSIM: [0.7587693333625793, 0.6571080684661865, 0.6396705508232117, 0.6090642809867859, 0.6141737103462219, 0.6303208470344543, 0.6249096393585205, 0.6111708283424377, 0.5969941020011902, 0.6141420602798462, 0.5875881314277649, 0.6006293296813965, 0.6844276189804077] diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_2.txt b/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..995b9f5301ab5aa8ac0d7cdc02d66f02a62b8d9f --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_2.txt @@ -0,0 +1,12 @@ +Video Index: 2 +Video Path: 000/000/002/2.mp4 +Caption: The video is a news segment featuring a man in a red baseball cap and a blue vest, standing in front of a statue of a soldier and two children. The man appears to be a veteran, as indicated by the cap and the context of the event. The event is an honorary ceremony for lost submarines and submarine veterans, taking place near the World Peace Bell in Newport. The news segment is titled "Connected to the Community" and is scheduled to air at 11:10 PM on ABC 9. The style of the video is informative and respectful, focusing on the man and the event, with a clear and concise presentation of the details. + +=== Metrics === +Average PSNR: 14.80 dB +Average MSE: 0.038476 +Average SSIM: 0.7774 + +Per-frame PSNR: [17.786073684692383, 17.306442260742188, 16.448156356811523, 17.898679733276367, 16.159894943237305, 12.169492721557617, 10.909040451049805, 13.182701110839844, 16.505319595336914, 11.525884628295898, 12.610316276550293, 15.05363655090332, 14.787336349487305] +Per-frame MSE: [0.016649171710014343, 0.01859327033162117, 0.02265605889260769, 0.016223028302192688, 0.024210870265960693, 0.06068072468042374, 0.0811140313744545, 0.048054039478302, 0.022359810769557953, 0.07037387788295746, 0.05482371523976326, 0.031234625726938248, 0.03320980817079544] +Per-frame SSIM: [0.8901564478874207, 0.8773940205574036, 0.846003532409668, 0.8839925527572632, 0.8481070399284363, 0.6637027263641357, 0.5538939237594604, 0.7319289445877075, 0.8778769969940186, 0.6086230278015137, 0.6928645372390747, 0.823104739189148, 0.8090554475784302] diff --git a/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_3.txt b/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b7c641ca5e67d22efe687fa4bc603197c5e1051 --- /dev/null +++ b/Meissonic/InfinityStar/infinity_vqvae_test_output/metrics_video_3.txt @@ -0,0 +1,12 @@ +Video Index: 3 +Video Path: 000/000/003/3.mp4 +Caption: The video features a man in a pink shirt and a black bucket hat, wearing glasses and a necklace. He is holding a spoon and making a playful face, as if he is about to eat something. 
The background shows a lush garden with trees and a wooden structure. The man's expression and the spoon suggest that he is about to taste something, possibly food. The overall style of the video is casual and fun, with a focus on the man's reaction to the food. + +=== Metrics === +Average PSNR: 17.33 dB +Average MSE: 0.020375 +Average SSIM: 0.8816 + +Per-frame PSNR: [18.200925827026367, 15.063268661499023, 14.665642738342285, 14.82711410522461, 15.583945274353027, 15.10338020324707, 18.969524383544922, 18.090055465698242, 18.42655372619629, 18.669588088989258, 17.83605194091797, 19.836143493652344, 20.00314712524414] +Per-frame MSE: [0.015132389031350613, 0.031165430322289467, 0.034153539687395096, 0.032907020300626755, 0.027644287794828415, 0.030878914520144463, 0.012677906081080437, 0.015523666515946388, 0.014366290532052517, 0.01358442660421133, 0.016458677127957344, 0.010384500958025455, 0.009992755949497223] +Per-frame SSIM: [0.9092389345169067, 0.8160971999168396, 0.797990620136261, 0.8093288540840149, 0.8389173746109009, 0.8193320631980896, 0.9264860153198242, 0.9109728336334229, 0.9173861742019653, 0.9218995571136475, 0.9083372354507446, 0.9410321116447449, 0.9432712197303772] diff --git a/Meissonic/InfinityStar/infinitystar_videovae.pth b/Meissonic/InfinityStar/infinitystar_videovae.pth new file mode 100644 index 0000000000000000000000000000000000000000..42c535f92db22a5a8beaf84201e69d5e3b4a0219 --- /dev/null +++ b/Meissonic/InfinityStar/infinitystar_videovae.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc4d7248f463ed8af63ae41755a036973c1a248b9073bef3689a19b3d38772b2 +size 738766996 diff --git a/Meissonic/InfinityStar/requirements.txt b/Meissonic/InfinityStar/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b3ee910e675345aef6fba52c033dc2ccd7fabc3 --- /dev/null +++ b/Meissonic/InfinityStar/requirements.txt @@ -0,0 +1,29 @@ +easydict +typed-argument-parser +seaborn +kornia +gputil +colorama +omegaconf +pandas +timm==0.9.6 +decord +transformers +torch==2.5.1 +pytz +pandas +wandb +colorama +imageio +einops +openai +httpx==0.20.0 +opencv-python +byted-omnistore +imageio-ffmpeg +imageio +kornia +tenacity +decord +beartype +fairscale \ No newline at end of file diff --git a/Meissonic/InfinityStar/scripts/extract_video_features.sh b/Meissonic/InfinityStar/scripts/extract_video_features.sh new file mode 100644 index 0000000000000000000000000000000000000000..93c1f28a9e8fa16636c7b14274c2d7a1badfdeed --- /dev/null +++ b/Meissonic/InfinityStar/scripts/extract_video_features.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +set -x + +# Configure different distributed environment variables according to your platform. 
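+# Example (an assumption, not part of the original script): when running outside
+# the Arnold platform, the two variables consumed by the torchrun launch below
+# must be exported manually, e.g. for a single node with 8 GPUs:
+#   export ARNOLD_WORKER_GPU=8   # used as --nproc_per_node
+#   export ARNOLD_WORKER_NUM=1   # used as --nnodes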
+# nccl setting +unset NCCL_NET_PLUGIN +unset NCCL_FASTRAK_ENABLE +unset NCCL_FASTRAK_USE_SNAP +unset NCCL_FASTRAK_NUM_FLOWS +unset NCCL_FASTRAK_* +export NCCL_P2P_LEVEL=NVL +export NCCL_IB_DISABLE=1 +export NCCL_NET_DISABLE=1 +export NCCL_NET_GDR_LEVEL=0 +export NCCL_NET_PLUGIN=NONE +export NCCL_NVLS_ENABLE=0 + +# arnold setting +unset ARNOLD_WORKER_0_HOST + +TORCHRUN_RDZV_READ_TIMEOUT=${TORCHRUN_RDZV_READ_TIMEOUT:-600} +ARNOLD_ID=${ARNOLD_ID:-0} +ARNOLD_WORKER_0_HOST=${ARNOLD_WORKER_0_HOST:-'localhost'} +ARNOLD_WORKER_0_PORT=${ARNOLD_WORKER_0_PORT:-'9591'} +RUN_COMMAND=${@:2} +PORT=$(echo "$ARNOLD_WORKER_0_PORT" | cut -d "," -f 1) +########--------------------------------------------------------------------------------------------------- + + +export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" +export TORCHINDUCTOR_COMPILE_THREADS=1 +export OMP_NUM_THREADS=8 + +# set exp args +video_data_path='./data/infinitystar_toy_data/split_jsonls' + +checkpoints_dir='./' +vae_path="${checkpoints_dir}infinitystar_videovae.pth" +token_cache_dir=./checkpoints/local_cache/cached_visual_tokens_720p + +# --pn: 0.90M for 720p, 0.40M for 480p \ + + +torchrun --nproc_per_node=$ARNOLD_WORKER_GPU \ + --nnodes=$ARNOLD_WORKER_NUM \ + --master_addr=$ARNOLD_WORKER_0_HOST \ + --node_rank=$ARNOLD_ID \ + --master_port=$PORT \ + --rdzv_conf=read_timeout=$TORCHRUN_RDZV_READ_TIMEOUT \ +tools/save_dataset_features.py \ +--pn 0.90M \ +--video_data_path=${video_data_path} \ +--video_frames 81 \ +--video_fps 16 \ +--vae_type=64 \ +--videovae=10 \ +--apply_spatial_patchify 1 \ +--image_batch_size=16 \ +--video_batch_size=1 \ +--use_slice 1 \ +--dataloader_workers=16 \ +--vae_path=${vae_path} \ +--dynamic_scale_schedule infinity_star_extract_features \ +--token_cache_dir=${token_cache_dir} \ +--video_caption_type='tarsier2_caption' \ +--only_images4extract_feats 0 \ +--train_max_token_len=-1 \ +--drop_long_video=0 \ +--min_video_frames=-1 \ +--cache_check_mode=-2 \ +--restrict_data_size=-1 \ +--use_feat_proj=2 \ +--use_two_stage_lfq=1 \ +--seed=1452 \ + + diff --git a/Meissonic/InfinityStar/scripts/train_480p.sh b/Meissonic/InfinityStar/scripts/train_480p.sh new file mode 100644 index 0000000000000000000000000000000000000000..affaab6ec052cfe22564044ce55c4e8d94210249 --- /dev/null +++ b/Meissonic/InfinityStar/scripts/train_480p.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +set -x + +# Configure different distributed environment variables according to your platform. 
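+# As in extract_video_features.sh, ARNOLD_WORKER_GPU (GPUs per node) and
+# ARNOLD_WORKER_NUM (number of nodes) must be exported when not running on the
+# Arnold platform. Note that image_data_path is never assigned in this script,
+# so --data_path is passed empty below; set it to an image jsonl directory first
+# if image data should also be used (illustrative note, not from the original).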
+# nccl setting
+unset NCCL_NET_PLUGIN
+unset NCCL_FASTRAK_ENABLE
+unset NCCL_FASTRAK_USE_SNAP
+unset NCCL_FASTRAK_NUM_FLOWS
+unset NCCL_FASTRAK_*
+export NCCL_P2P_LEVEL=NVL
+export NCCL_IB_DISABLE=1
+export NCCL_NET_DISABLE=1
+export NCCL_NET_GDR_LEVEL=0
+export NCCL_NET_PLUGIN=NONE
+export NCCL_NVLS_ENABLE=0
+
+# arnold setting
+unset ARNOLD_WORKER_0_HOST
+
+TORCHRUN_RDZV_READ_TIMEOUT=${TORCHRUN_RDZV_READ_TIMEOUT:-600}
+ARNOLD_ID=${ARNOLD_ID:-0}
+ARNOLD_WORKER_0_HOST=${ARNOLD_WORKER_0_HOST:-'localhost'}
+ARNOLD_WORKER_0_PORT=${ARNOLD_WORKER_0_PORT:-'9591'}
+RUN_COMMAND=${@:2}
+PORT=$(echo "$ARNOLD_WORKER_0_PORT" | cut -d "," -f 1)
+########---------------------------------------------------------------------------------------------------
+
+
+
+# torch setting
+export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
+export PYTHONPATH=infinity/models/:$PYTHONPATH
+export TORCHINDUCTOR_COMPILE_THREADS=1
+export OMP_NUM_THREADS=8
+
+wandb offline
+# wandb online
+exp_name=overfitting_var_dbg
+bed_path=./checkpoints/${exp_name}/
+video_data_path='./data/infinitystar_toy_data/split_jsonls'
+
+checkpoints_dir='./'
+t5_path="${checkpoints_dir}text_encoder/flan-t5-xl-official"
+vae_path="${checkpoints_dir}infinitystar_videovae.pth"
+resume_path="${checkpoints_dir}infinitystar_8b_480p_weights"
+vae_type=64
+videovae=10
+token_cache_dir=./checkpoints/local_cache/cached_visual_tokens_480p
+
+LOCAL_OUT=${LOCAL_OUT:-checkpoints}
+mkdir -p $LOCAL_OUT
+
+local_out_path=$LOCAL_OUT/${exp_name}
+video_fps=16
+video_frames=81
+
+# Create the noise_apply_strength list
+noise_apply_strength=()
+noise_apply_strength+=($(printf "0.3\n%.0s" {1..200}))
+noise_apply_strength_str=$(IFS=,; echo "${noise_apply_strength[*]}")
+
+torchrun --nproc_per_node=$ARNOLD_WORKER_GPU \
+--nnodes=$ARNOLD_WORKER_NUM \
+--master_addr=$ARNOLD_WORKER_0_HOST \
+--node_rank=$ARNOLD_ID \
+--master_port=$PORT \
+--rdzv_conf=read_timeout=$TORCHRUN_RDZV_READ_TIMEOUT \
+train.py \
+--local_out_path ${local_out_path} \
+--bed=${bed_path} \
+--data_path=${image_data_path} \
+--video_data_path=${video_data_path} \
+--t5_path=${t5_path} \
+--vae_type=${vae_type} \
+--videovae=${videovae} \
+--vae_path=${vae_path} \
+--token_cache_dir=${token_cache_dir} \
+--tlr=4e-5 \
+--pn 0.40M \
+--model=infinity_qwen8b \
+--project_name=infinity \
+--exp_name=${exp_name} \
+--checkpoint_type='torch' \
+--enable_checkpointing=full-block \
+--video_fps=${video_fps} \
+--video_frames=${video_frames} \
+--short_cap_prob 0.3 \
+--use_streaming_dataset 1 \
+--iterable_data_buffersize 1000 \
+--enable_dynamic_length_prompt 1 \
+--reweight_loss_by_scale 4 \
+--zero=3 \
+--save_model_iters_freq 200 \
+--noise_apply_strength="$noise_apply_strength_str" \
+--dynamic_scale_schedule=infinity_elegant_clip20frames_v2 \
+--mask_type=infinity_elegant_clip20frames_v2 \
+--use_flex_attn=True \
+--use_vae_token_cache=1 \
+--cache_check_mode=1 \
+--allow_online_vae_feature_extraction=0 \
+--train_with_var_seq_len=1 \
+--video_var_len_prob='[40, 30, 20, 6, 3, 1, 60, 40, 12, 6, 2]' \
+--drop_long_video=0 \
+--image_scale_repetition='[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]' \
+--video_scale_repetition='[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1]' \
+--append_duration2caption=1 \
+--wp_it=0 \
+--use_two_stage_lfq=1 \
+--semantic_scale_dim=16 \
+--detail_scale_min_tokens=350 \
+--semantic_scales=11 \
+--allow_less_one_elem_in_seq=1 \
+--use_feat_proj=2 \
+--drop_720p_last_scale=1 \
+--twoclip_alternatingtraining=0 \
+--enable_hybrid_shard=0 \
+--restrict_data_size=-1 \
+--sp_size=1 \
+--torchshard_resume=${resume_path}
+
+
+
+
diff --git a/Meissonic/InfinityStar/scripts/train_720p.sh b/Meissonic/InfinityStar/scripts/train_720p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e9dd52965ceb7584e72e9e14b5a7f2afed3d7b50
--- /dev/null
+++ b/Meissonic/InfinityStar/scripts/train_720p.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+set -x
+
+####### Configure different distributed environment variables according to your platform.
+# nccl setting
+unset NCCL_NET_PLUGIN
+unset NCCL_FASTRAK_ENABLE
+unset NCCL_FASTRAK_USE_SNAP
+unset NCCL_FASTRAK_NUM_FLOWS
+unset NCCL_FASTRAK_*
+export NCCL_P2P_LEVEL=NVL
+export NCCL_IB_DISABLE=1
+export NCCL_NET_DISABLE=1
+export NCCL_NET_GDR_LEVEL=0
+export NCCL_NET_PLUGIN=NONE
+export NCCL_NVLS_ENABLE=0
+
+
+# arnold setting
+unset ARNOLD_WORKER_0_HOST
+
+TORCHRUN_RDZV_READ_TIMEOUT=${TORCHRUN_RDZV_READ_TIMEOUT:-600}
+ARNOLD_ID=${ARNOLD_ID:-0}
+ARNOLD_WORKER_0_HOST=${ARNOLD_WORKER_0_HOST:-'localhost'}
+ARNOLD_WORKER_0_PORT=${ARNOLD_WORKER_0_PORT:-'9591'}
+RUN_COMMAND=${@:2}
+PORT=$(echo "$ARNOLD_WORKER_0_PORT" | cut -d "," -f 1)
+########---------------------------------------------------------------------------------------------------
+
+
+
+
+# torch setting
+export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
+export PYTHONPATH=infinity/models/:$PYTHONPATH
+export TORCHINDUCTOR_COMPILE_THREADS=1
+export OMP_NUM_THREADS=8
+
+wandb offline
+# wandb online
+exp_name=debug_release
+bed_path=./checkpoints/${exp_name}/
+video_data_path='./data/infinitystar_toy_data/split_jsonls'
+
+checkpoints_dir='./'
+t5_path="${checkpoints_dir}text_encoder/flan-t5-xl-official"
+vae_path="${checkpoints_dir}infinitystar_videovae.pth"
+resume_path="${checkpoints_dir}infinitystar_8b_720p_weights"
+vae_type=64
+videovae=10
+token_cache_dir=./checkpoints/local_cache/cached_visual_tokens_720p
+
+LOCAL_OUT=${LOCAL_OUT:-checkpoints}
+mkdir -p $LOCAL_OUT
+
+local_out_path=$LOCAL_OUT/${exp_name}
+video_fps=16
+video_frames=81
+
+# Create the noise_apply_strength list
+noise_apply_strength=()
+noise_apply_strength+=($(printf "0.3\n%.0s" {1..200}))
+noise_apply_strength_str=$(IFS=,; echo "${noise_apply_strength[*]}")
+
+torchrun --nproc_per_node=$ARNOLD_WORKER_GPU \
+--nnodes=$ARNOLD_WORKER_NUM \
+--master_addr=$ARNOLD_WORKER_0_HOST \
+--node_rank=$ARNOLD_ID \
+--master_port=$PORT \
+--rdzv_conf=read_timeout=$TORCHRUN_RDZV_READ_TIMEOUT \
+train.py \
+--local_out_path ${local_out_path} \
+--bed=${bed_path} \
+--data_path=${image_data_path} \
+--video_data_path=${video_data_path} \
+--t5_path=${t5_path} \
+--vae_type=${vae_type} \
+--videovae=${videovae} \
+--vae_path=${vae_path} \
+--token_cache_dir=${token_cache_dir} \
+--tlr=4e-5 \
+--pn=0.90M \
+--model=infinity_qwen8b \
+--project_name=infinity \
+--exp_name=${exp_name} \
+--checkpoint_type='torch' \
+--enable_checkpointing=full-block \
+--video_fps=${video_fps} \
+--video_frames=${video_frames} \
+--short_cap_prob 0.3 \
+--use_streaming_dataset=1 \
+--iterable_data_buffersize=1000 \
+--enable_dynamic_length_prompt=1 \
+--reweight_loss_by_scale 4 \
+--zero=3 \
+--save_model_iters_freq 200 \
+--noise_apply_strength="$noise_apply_strength_str" \
+--dynamic_scale_schedule=infinity_elegant_clip20frames_v2 \
+--mask_type=infinity_elegant_clip20frames_v2 \
+--use_flex_attn=True \
+--use_vae_token_cache=1 \
+--cache_check_mode=1 \
+--allow_online_vae_feature_extraction=0 \
+--train_with_var_seq_len=1 \
+--video_var_len_prob='[60, 20, 10, 6, 2, 1, 1]' \
+--drop_long_video=0 \
+--image_scale_repetition='[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3]' \ +--video_scale_repetition='[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 1]' \ +--append_duration2caption=1 \ +--wp_it=0 \ +--use_two_stage_lfq=1 \ +--semantic_scale_dim=16 \ +--detail_scale_min_tokens=750 \ +--semantic_scales=12 \ +--allow_less_one_elem_in_seq=1 \ +--use_feat_proj=2 \ +--restrict_data_size=-1 \ +--enable_hybrid_shard=0 \ +--sp_size=1 \ +--drop_720p_last_scale=1 \ +--torchshard_resume=${resume_path} \ + +# The largest scale of 720p has a long sequence length, which requires sequence parallel for multi-node training. +# --drop_720p_last_scale=0 \ +# --enable_hybrid_shard=1 \ +# --sp_size=2 \ \ No newline at end of file diff --git a/Meissonic/InfinityStar/test_infinity_vqvae.py b/Meissonic/InfinityStar/test_infinity_vqvae.py new file mode 100644 index 0000000000000000000000000000000000000000..537bb21fa4d8cd2ab04d1eb20a61831f62ccc481 --- /dev/null +++ b/Meissonic/InfinityStar/test_infinity_vqvae.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +""" +Test script for InfinityStar VQ-VAE performance. + +This script: +1. Loads a video from the training dataset (same as test_cosmos_vqvae.py) +2. Encodes it using InfinityStar VAE +3. Decodes it back +4. Computes metrics (PSNR, SSIM, MSE) - same as test_cosmos_vqvae.py +5. Creates a side-by-side comparison video +6. Saves the results +""" + +import os +import sys +import torch +import numpy as np +from PIL import Image +import cv2 +from torchvision import transforms +from torchvision.utils import make_grid, save_image + +# Add Meissonic to path FIRST to avoid importing InfinityStar's train.py +meissonic_path = "/mnt/Meissonic" #os.path.join(os.path.dirname(os.path.dirname(__file__)), "Meissonic") +if os.path.exists(meissonic_path): + sys.path.insert(0, meissonic_path) + # Also add Meissonic's train directory to path + meissonic_train_path = os.path.join(meissonic_path, "train") + if os.path.exists(meissonic_train_path): + sys.path.insert(0, meissonic_train_path) + +# Add InfinityStar to path (but after Meissonic) +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +# Avoid importing arg_util which depends on 'tap' package (has Python 2 syntax issues) +# Create a simple Args class instead +class SimpleArgs: + """Simple replacement for Args class to avoid tap dependency.""" + def __init__(self): + # Quantizer-related fields: MUST match the checkpoint config + self.semantic_scale_dim = 16 + self.detail_scale_dim = 64 + self.use_learnable_dim_proj = 0 + self.detail_scale_min_tokens = 80 + # IMPORTANT: for infinitystar_videovae.pth this must be 2, + # otherwise the quantizer takes a different feature projection path + # and reconstructions become very blurry. 
+ self.use_feat_proj = 2 + self.semantic_scales = 8 + # VAE-specific attributes + self.vae_path = "" + self.vae_type = 18 + self.videovae = 10 + +# Import load_visual_tokenizer directly, avoiding arg_util import +import sys +import importlib.util + +# Load load_visual_tokenizer function without importing arg_util +def load_visual_tokenizer_safe(args, device=None): + """Load visual tokenizer without importing arg_util.""" + if not device: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if args.vae_type in [8,12,14,16,18,20,24,32,48,64,128]: + schedule_mode = "dynamic" + codebook_dim = args.vae_type + print(f'Load VAE from {args.vae_path}') + + if args.videovae == 10: # absorb patchify + from infinity.models.videovae.models.load_vae_bsq_wan_absorb_patchify import video_vae_model + vae_local = video_vae_model(args.vae_path, schedule_mode, codebook_dim, global_args=args, test_mode=True).to(device) + else: + raise ValueError(f"vae_type {args.vae_type} not supported") + else: + raise ValueError(f"vae_type {args.vae_type} not supported") + return vae_local + +# Import dataset utilities from Meissonic using direct file import to avoid conflicts +try: + # Import directly from Meissonic's train directory to avoid InfinityStar's train.py + import importlib.util + dataset_utils_path = os.path.join(meissonic_path, "train", "dataset_utils.py") + if os.path.exists(dataset_utils_path): + spec = importlib.util.spec_from_file_location("meissonic_dataset_utils", dataset_utils_path) + dataset_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(dataset_utils) + OpenVid1MDataset = dataset_utils.OpenVid1MDataset + from transformers import T5Tokenizer + DATASET_AVAILABLE = True + print(f"Loaded dataset utilities from Meissonic: {dataset_utils_path}") + else: + raise ImportError(f"Could not find dataset_utils.py at {dataset_utils_path}") +except Exception as e: + DATASET_AVAILABLE = False + print(f"Warning: Could not import dataset utilities: {e}") + print("Will use direct video loading.") + + +def calculate_psnr(img1, img2, max_val=1.0): + """Calculate PSNR between two images.""" + # Ensure both tensors are on CPU + img1 = img1.cpu() if isinstance(img1, torch.Tensor) else torch.tensor(img1) + img2 = img2.cpu() if isinstance(img2, torch.Tensor) else torch.tensor(img2) + + mse = torch.mean((img1 - img2) ** 2) + if mse == 0: + return float('inf') + psnr = 20 * torch.log10(max_val / torch.sqrt(mse)) + return psnr.item() + + +def calculate_mse(img1, img2): + """Calculate MSE between two images.""" + # Ensure both tensors are on CPU + img1 = img1.cpu() if isinstance(img1, torch.Tensor) else torch.tensor(img1) + img2 = img2.cpu() if isinstance(img2, torch.Tensor) else torch.tensor(img2) + + return torch.mean((img1 - img2) ** 2).item() + + +def calculate_ssim(img1, img2, window_size=11): + """Calculate SSIM between two images (simplified version).""" + # Ensure both tensors are on CPU + img1 = img1.cpu() if isinstance(img1, torch.Tensor) else torch.tensor(img1) + img2 = img2.cpu() if isinstance(img2, torch.Tensor) else torch.tensor(img2) + + # Simple SSIM approximation + C1 = 0.01 ** 2 + C2 = 0.03 ** 2 + + mu1 = img1.mean() + mu2 = img2.mean() + + sigma1_sq = img1.var() + sigma2_sq = img2.var() + sigma12 = ((img1 - mu1) * (img2 - mu2)).mean() + + ssim = ((2 * mu1 * mu2 + C1) * (2 * sigma12 + C2)) / ((mu1**2 + mu2**2 + C1) * (sigma1_sq + sigma2_sq + C2)) + return ssim.item() + + +def video_to_numpy(video_tensor): + """ + Convert video tensor [C, F, H, W] in [0, 1] to numpy 
array [F, H, W, C] in [0, 255] (RGB). + """ + if isinstance(video_tensor, torch.Tensor): + # [C, F, H, W] -> [F, C, H, W] -> [F, H, W, C] + video_np = video_tensor.permute(1, 0, 2, 3).cpu().numpy() # [F, C, H, W] + video_np = np.transpose(video_np, (0, 2, 3, 1)) # [F, H, W, C] + # Clamp to [0, 1] and convert to [0, 255] + video_np = np.clip(video_np, 0, 1) + video_np = (video_np * 255).astype(np.uint8) + else: + video_np = np.array(video_tensor) + return video_np + + +def create_side_by_side_video(original, reconstructed, output_path, fps=8): + """ + Create a side-by-side comparison video. + + Args: + original: Original video tensor [C, F, H, W] or numpy array + reconstructed: Reconstructed video tensor [C, F, H, W] or numpy array + output_path: Path to save the output video + fps: Frames per second + """ + # Convert to numpy (RGB format: [F, H, W, C]) + orig_np = video_to_numpy(original) + recon_np = video_to_numpy(reconstructed) + + # Get dimensions + F, H, W, C = orig_np.shape + F_recon, H_recon, W_recon, C_recon = recon_np.shape + + # Ensure same number of frames + F_min = min(F, F_recon) + orig_np = orig_np[:F_min] + recon_np = recon_np[:F_min] + + # Resize if needed + if (H, W) != (H_recon, W_recon): + recon_np = np.array([cv2.resize(frame, (W, H)) for frame in recon_np]) + + # Create side-by-side frames + comparison_frames = [] + for t in range(F_min): + orig = orig_np[t] + recon = recon_np[t] + + # Add text labels + orig_labeled = orig.copy() + recon_labeled = recon.copy() + cv2.putText(orig_labeled, "Original", (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) + cv2.putText(recon_labeled, "Reconstructed", (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2) + + # Concatenate horizontally + side_by_side = np.concatenate([orig_labeled, recon_labeled], axis=1) + comparison_frames.append(side_by_side) + + # Save video + if len(comparison_frames) == 0: + raise ValueError("No frames to save") + + height, width = comparison_frames[0].shape[:2] + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(output_path, fourcc, fps, (width, height)) + + for frame in comparison_frames: + # Convert RGB to BGR for OpenCV + frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + out.write(frame_bgr) + + out.release() + print(f"Saved side-by-side video to: {output_path}") + + +def add_text_to_image(image_tensor, text, position=(10, 30)): + """ + Add text label to an image tensor. 
+ + Args: + image_tensor: Image tensor [C, H, W] in [0, 1] + text: Text to add + position: (x, y) position for text + Returns: + Image tensor with text [C, H, W] + """ + # Convert to PIL Image + image_np = image_tensor.permute(1, 2, 0).cpu().numpy() # [H, W, C] + image_np = np.clip(image_np, 0, 1) + image_np = (image_np * 255).astype(np.uint8) + pil_image = Image.fromarray(image_np) + + # Add text + from PIL import ImageDraw, ImageFont + draw = ImageDraw.Draw(pil_image) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 24) + except: + try: + font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) + except: + font = ImageFont.load_default() + + # Draw white text with black outline + x, y = position + # Draw outline + for adj in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]: + draw.text((x + adj[0], y + adj[1]), text, font=font, fill=(0, 0, 0)) + # Draw main text + draw.text((x, y), text, font=font, fill=(255, 255, 255)) + + # Convert back to tensor + image_tensor = transforms.ToTensor()(pil_image) + return image_tensor + + +def create_comparison_grid(original, reconstructed, output_path, nrow=4): + """ + Create a grid image comparing original and reconstructed frames. + + Args: + original: Original video tensor [C, F, H, W] + reconstructed: Reconstructed video tensor [C, F, H, W] + output_path: Path to save the grid image + nrow: Number of frames per row + """ + # Get number of frames + F = min(original.shape[1], reconstructed.shape[1]) + + # Select frames to display (same as test_cosmos_vqvae.py) + num_frames_to_show = min(8, F) + frame_indices = np.linspace(0, F - 1, num_frames_to_show, dtype=int) + + frames_list = [] + for idx in frame_indices: + # Original frame with label + orig_frame = original[:, idx, :, :].clone() # [C, H, W] + orig_frame = add_text_to_image(orig_frame, "Original", position=(10, 10)) + frames_list.append(orig_frame) + + # Reconstructed frame with label + recon_frame = reconstructed[:, idx, :, :].clone() # [C, H, W] + recon_frame = add_text_to_image(recon_frame, "Reconstructed", position=(10, 10)) + frames_list.append(recon_frame) + + # Create grid (nrow * 2 because each frame has original and reconstructed) + frames_tensor = torch.stack(frames_list, dim=0) + grid = make_grid(frames_tensor, nrow=nrow * 2, padding=2, pad_value=1.0) + + save_image(grid, output_path) + print(f"Saved comparison grid to: {output_path}") + + +def main(): + # Direct paths (like test_cosmos_vqvae.py) + # Modify these paths according to your setup + VAE_PATH = "/mnt/Meissonic/InfinityStar/infinitystar_videovae.pth" # Update this path + VAE_TYPE = 18 # codebook_dim + VIDEOVAE = 10 # absorb patchify + + # Dataset paths (same as test_cosmos_vqvae.py) + CSV_PATH = "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" # Update this path + VIDEO_ROOT_DIR = None # Auto-detect if None + VIDEO_INDEX = 3 # Index of video to test + + # Video parameters (same as test_cosmos_vqvae.py) + NUM_FRAMES = 16 + HEIGHT = 480 + WIDTH = 848 + + # Output + OUTPUT_DIR = "./infinity_vqvae_test_output" + DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + DTYPE = "float32" + + # Create output directory + os.makedirs(OUTPUT_DIR, exist_ok=True) + + # Set device and dtype + device = torch.device(DEVICE) + if DTYPE == "float16": + dtype = torch.float16 + elif DTYPE == "bfloat16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + + print(f"Using device: {device}, dtype: {dtype}") + + # Load VAE + print("=" * 80) + 
print("Loading VQ-VAE model...") + print(f" VAE path: {VAE_PATH}") + print(f" VAE type: {VAE_TYPE}") + print(f" Video VAE: {VIDEOVAE}") + print("=" * 80) + + vae_args = SimpleArgs() + vae_args.vae_path = VAE_PATH + vae_args.vae_type = VAE_TYPE + vae_args.videovae = VIDEOVAE + + vae = load_visual_tokenizer_safe(vae_args, device=device) + vae = vae.to(device) + vae.eval() + # Disable gradient computation for all parameters (same as official code) + [p.requires_grad_(False) for p in vae.parameters()] + + print("VAE loaded successfully!") + print(f" Device: {device}") + print(f" Model dtype: {next(vae.parameters()).dtype}") + print(f" Model in eval mode: {not vae.training}") + + # Load dataset (same as test_cosmos_vqvae.py) + if DATASET_AVAILABLE: + print(f"\nLoading dataset from: {CSV_PATH}") + + # Auto-detect video_root_dir if not provided + video_root_dir = VIDEO_ROOT_DIR + if video_root_dir is None: + csv_dir = os.path.dirname(CSV_PATH) + if os.path.exists(os.path.join(csv_dir, 'video_reorg')): + video_root_dir = os.path.join(csv_dir, 'video_reorg') + elif os.path.exists(os.path.join(os.path.dirname(csv_dir), 'video_reorg')): + video_root_dir = os.path.join(os.path.dirname(csv_dir), 'video_reorg') + else: + video_root_dir = csv_dir + print(f"Warning: Video directory not found, using CSV directory: {video_root_dir}") + + # Initialize tokenizer for dataset + tokenizer = T5Tokenizer.from_pretrained("google/umt5-base") + + # Create dataset + dataset = OpenVid1MDataset( + csv_path=CSV_PATH, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=NUM_FRAMES, + height=HEIGHT, + width=WIDTH, + text_encoder_architecture="umt5-base", + ) + + print(f"Dataset size: {len(dataset)}") + + # Load video + if VIDEO_INDEX >= len(dataset): + print(f"Error: video_index {VIDEO_INDEX} >= dataset size {len(dataset)}") + return + + print(f"Loading video at index {VIDEO_INDEX}...") + sample = dataset[VIDEO_INDEX] + original_video = sample["video"] + + # Ensure video is [C, T, H, W] format (VAE expects this) + if original_video.dim() == 4: + # Check if it's [T, C, H, W] format + if original_video.shape[0] == NUM_FRAMES and original_video.shape[1] == 3: + print(f"Detected [T, C, H, W] format, converting to [C, T, H, W]") + original_video = original_video.permute(1, 0, 2, 3) + # Check if it's [T, H, W, C] format + elif original_video.shape[-1] == 3: + print(f"Detected [T, H, W, C] format, converting to [C, T, H, W]") + original_video = original_video.permute(3, 0, 1, 2) + + # Get video info from dataset + row = dataset.data[VIDEO_INDEX] + video_path = row.get('video', 'unknown') + caption = row.get('caption', 'no caption') + + print(f"Video path: {video_path}") + print(f"Caption: {caption}") + else: + print("Warning: Dataset utilities not available. Using dummy video.") + original_video = torch.rand(3, NUM_FRAMES, HEIGHT, WIDTH) + video_path = "dummy" + caption = "dummy video" + + print(f"Original video shape (C, T, H, W): {original_video.shape}") + print(f"Original video range (from dataset): [{original_video.min():.3f}, {original_video.max():.3f}]") + + # Move to device + video_for_vae = original_video.to(device=device, dtype=dtype) + + # OpenVid1MDataset.process_video normalizes to [0, 1]. + # VAE expects [-1, 1]. 
+ video_for_vae = video_for_vae.clamp(0.0, 1.0) + print("Dataset returns [0, 1], converting to [-1, 1] for VAE") + video_for_vae = video_for_vae * 2.0 - 1.0 + + print(f"Video for VAE range: [{video_for_vae.min():.3f}, {video_for_vae.max():.3f}]") + + # Convert to [B, C, T, H, W] format + video_for_vae = video_for_vae.unsqueeze(0) # [1, C, T, H, W] + + # Encode: Use VAE's official interface (same as test_vae_reconstruction_simple.py) + print("\n" + "=" * 80) + print("Encoding using vae.encode_for_raw_features (InfinityStar's method)...") + print("=" * 80) + + with torch.no_grad(): + # Use InfinityStar's encode_for_raw_features (same as working script) + raw_features, _, _ = vae.encode_for_raw_features( + video_for_vae, + scale_schedule=None, + slice=True + ) + print(f"Encoded latent shape: {raw_features.shape}") + print(f"Encoded latent range: [{raw_features.min().item():.4f}, {raw_features.max().item():.4f}]") + + # Decode: Use VAE's official interface (same as test_vae_reconstruction_simple.py) + print("\n" + "=" * 80) + print("Decoding using vae.decode (InfinityStar's method)...") + print("=" * 80) + + with torch.no_grad(): + # Use InfinityStar's decode (same as working script) + reconstructed_video_batch = vae.decode(raw_features, slice=True) + if isinstance(reconstructed_video_batch, tuple): + reconstructed_video_batch = reconstructed_video_batch[0] + + # Clamp like in InfinityStar's code (same as working script) + reconstructed_video_batch = torch.clamp(reconstructed_video_batch, min=-1, max=1) + + print(f"Reconstructed shape: {reconstructed_video_batch.shape}") + print(f"Reconstructed range: [{reconstructed_video_batch.min():.3f}, {reconstructed_video_batch.max():.3f}]") + + # Convert back to [C, F, H, W] format + reconstructed_video = reconstructed_video_batch.squeeze(0) # [C, T, H, W] = [C, F, H, W] + + # Normalize reconstructed video to [0, 1] for visualization + # Check if output is in [-1, 1] or [0, 1] + if reconstructed_video.min() < 0: + print("Reconstructed video is in [-1, 1], converting to [0, 1]") + reconstructed_video_01 = (reconstructed_video + 1.0) / 2.0 + else: + print("Reconstructed video is already in [0, 1]") + reconstructed_video_01 = reconstructed_video.clone() + reconstructed_video_01 = torch.clamp(reconstructed_video_01, 0, 1) + print(f"Reconstructed video [0, 1] range: [{reconstructed_video_01.min():.3f}, {reconstructed_video_01.max():.3f}]") + + # Normalize original video to [0, 1] for visualization + original_video_01 = original_video.clone().to(device=device) + if original_video_01.min() < 0: + original_video_01 = (original_video_01 + 1.0) / 2.0 + elif original_video_01.max() > 1.0: + original_video_01 = original_video_01 / 255.0 + original_video_01 = torch.clamp(original_video_01, 0, 1) + print(f"Original video [0, 1] range: [{original_video_01.min():.3f}, {original_video_01.max():.3f}]") + + # Ensure same number of frames for comparison + F_orig = original_video_01.shape[1] + F_recon = reconstructed_video_01.shape[1] + F_min = min(F_orig, F_recon) + + if F_orig != F_recon: + print(f"Frame count mismatch: original={F_orig}, reconstructed={F_recon}, using first {F_min} frames for comparison") + print(" (This is normal for VAE with temporal compression)") + + original_video_01 = original_video_01[:, :F_min, :, :] + reconstructed_video_01 = reconstructed_video_01[:, :F_min, :, :] + + # Resize if spatial dimensions don't match + if original_video_01.shape[2:] != reconstructed_video_01.shape[2:]: + print(f"Resizing reconstructed video from 
{reconstructed_video_01.shape[2:]} to {original_video_01.shape[2:]}") + # Use interpolation to resize + reconstructed_video_resized = torch.zeros_like(original_video_01) + for f in range(F_min): + frame = reconstructed_video_01[:, f, :, :].unsqueeze(0) # [1, C, H, W] + frame_resized = torch.nn.functional.interpolate( + frame, size=original_video_01.shape[2:], mode='bilinear', align_corners=False + ) + reconstructed_video_resized[:, f, :, :] = frame_resized.squeeze(0) + reconstructed_video_01 = reconstructed_video_resized + + # Calculate metrics (same as test_cosmos_vqvae.py) + print("\nCalculating metrics...") + + # Convert to float32 for metric calculation (already in [0, 1]) + orig_f32 = original_video_01.to(torch.float32) + recon_f32 = reconstructed_video_01.to(torch.float32) + + # Frame-wise metrics + psnr_values = [] + mse_values = [] + ssim_values = [] + + for f in range(F_min): + orig_frame = orig_f32[:, f, :, :] # [C, H, W] + recon_frame = recon_f32[:, f, :, :] # [C, H, W] + + psnr = calculate_psnr(orig_frame, recon_frame) + mse = calculate_mse(orig_frame, recon_frame) + ssim = calculate_ssim(orig_frame, recon_frame) + + psnr_values.append(psnr) + mse_values.append(mse) + ssim_values.append(ssim) + + # Overall metrics + avg_psnr = np.mean(psnr_values) + avg_mse = np.mean(mse_values) + avg_ssim = np.mean(ssim_values) + + print(f"\n=== Metrics ===") + print(f"PSNR: {avg_psnr:.2f} dB (per frame: {psnr_values})") + print(f"MSE: {avg_mse:.6f} (per frame: {mse_values})") + print(f"SSIM: {avg_ssim:.4f} (per frame: {ssim_values})") + + # Save metrics to file + metrics_file = os.path.join(OUTPUT_DIR, f"metrics_video_{VIDEO_INDEX}.txt") + with open(metrics_file, 'w') as f: + f.write(f"Video Index: {VIDEO_INDEX}\n") + f.write(f"Video Path: {video_path}\n") + f.write(f"Caption: {caption}\n") + f.write(f"\n=== Metrics ===\n") + f.write(f"Average PSNR: {avg_psnr:.2f} dB\n") + f.write(f"Average MSE: {avg_mse:.6f}\n") + f.write(f"Average SSIM: {avg_ssim:.4f}\n") + f.write(f"\nPer-frame PSNR: {psnr_values}\n") + f.write(f"Per-frame MSE: {mse_values}\n") + f.write(f"Per-frame SSIM: {ssim_values}\n") + + print(f"Saved metrics to: {metrics_file}") + + # Create side-by-side video + print("\nCreating side-by-side comparison video...") + video_output_path = os.path.join(OUTPUT_DIR, f"comparison_video_{VIDEO_INDEX}.mp4") + create_side_by_side_video(original_video_01, reconstructed_video_01, video_output_path, fps=8) + + # Create comparison grid + print("Creating comparison grid...") + grid_output_path = os.path.join(OUTPUT_DIR, f"comparison_grid_video_{VIDEO_INDEX}.png") + create_comparison_grid(original_video_01, reconstructed_video_01, grid_output_path, nrow=4) + + print(f"\n=== Test Complete ===") + print(f"Results saved to: {OUTPUT_DIR}") + print(f" - Metrics: {metrics_file}") + print(f" - Side-by-side video: {video_output_path}") + print(f" - Comparison grid: {grid_output_path}") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/InfinityStar/test_infinity_vqvae.sh b/Meissonic/InfinityStar/test_infinity_vqvae.sh new file mode 100644 index 0000000000000000000000000000000000000000..f84bb16bb73f8a8b7d4793e6714ebfb0aa6d7b2a --- /dev/null +++ b/Meissonic/InfinityStar/test_infinity_vqvae.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Test script for InfinityStar VQ-VAE performance +# Same test standard as test_cosmos_vqvae.sh + +# Update these paths in test_infinity_vqvae.py: +# - VAE_PATH: Path to InfinityStar VAE checkpoint +# - CSV_PATH: Path to OpenVid1M CSV file (same as test_cosmos_vqvae.sh) + +# 
VAE_PATH = "/mnt/Meissonic/InfinityStar/infinitystar_videovae.pth" +# CSV_PATH = "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" +# VIDEO_INDEX = 0 # 与 test_cosmos_vqvae.py 使用相同的索引 + + +python test_infinity_vqvae.py + diff --git a/Meissonic/InfinityStar/test_vae_reconstruction_simple.py b/Meissonic/InfinityStar/test_vae_reconstruction_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..b24e5dbe0ecb3a575eb8a5ccfb3c9035297dc819 --- /dev/null +++ b/Meissonic/InfinityStar/test_vae_reconstruction_simple.py @@ -0,0 +1,698 @@ +#!/usr/bin/env python3 +""" +Simple VAE reconstruction test using InfinityStar's own code and video. +This directly uses InfinityStar's encode_for_raw_features and decode methods. +""" + +import os +import sys +import torch +import numpy as np +import cv2 +from PIL import Image +import imageio +from torchvision import transforms +from torchvision.utils import make_grid, save_image + +# Add InfinityStar to path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +# Avoid importing arg_util which has tap dependency issues +# Directly import what we need +from infinity.models.videovae.models.wan_bsq_vae import AutoencoderKLCogVideoX +from infinity.utils.video_decoder import EncodedVideoDecord +import argparse + +# Copy video_vae_model function to avoid circular import issues +def video_vae_model(vqgan_ckpt, schedule_mode, codebook_dim, global_args=None, test_mode=True): + """Load VAE model (copied from load_vae_bsq_wan_absorb_patchify.py to avoid import issues).""" + # Handle global_args with defaults + if global_args is None: + # Create a minimal args object with required fields + class MinimalArgs: + semantic_scale_dim = 16 + detail_scale_dim = 64 + use_learnable_dim_proj = 0 + detail_scale_min_tokens = 80 + use_feat_proj = 2 + semantic_scales = 8 + global_args = MinimalArgs() + else: + # Ensure all required fields exist with defaults + if not hasattr(global_args, 'semantic_scale_dim'): + global_args.semantic_scale_dim = getattr(global_args, 'semantic_scale_dim', 16) + if not hasattr(global_args, 'detail_scale_dim'): + global_args.detail_scale_dim = getattr(global_args, 'detail_scale_dim', 64) + if not hasattr(global_args, 'use_learnable_dim_proj'): + global_args.use_learnable_dim_proj = getattr(global_args, 'use_learnable_dim_proj', 0) + if not hasattr(global_args, 'detail_scale_min_tokens'): + global_args.detail_scale_min_tokens = getattr(global_args, 'detail_scale_min_tokens', 80) + if not hasattr(global_args, 'use_feat_proj'): + global_args.use_feat_proj = getattr(global_args, 'use_feat_proj', 2) + if not hasattr(global_args, 'semantic_scales'): + global_args.semantic_scales = getattr(global_args, 'semantic_scales', 8) + + args = argparse.Namespace( + vqgan_ckpt=vqgan_ckpt, + sd_ckpt=None, + use_frames=None, + inference_type='video', + save_prediction=True, + save_dir='results', + intermediate_tensor=True, + save_z=False, + save_frames=False, + image_recon4video=False, + junke_old=False, + cal_norm=False, + save_samples=None, + device='cuda', + noise_scale=0.0, + max_steps=1000000.0, + log_every=1, + ckpt_every=1000, + default_root_dir='/tmp', + compile='no', + ema='no', + mfu_logging='no', + dataloader_init_epoch=-1, + context_parallel_size=0, + video_ranks_ratio=-1.0, + lr=0.0001, + beta1=0.9, + beta2=0.95, + optim_type='Adam', + disc_optim_type=None, + max_grad_norm=1.0, + max_grad_norm_disc=1.0, + disable_sch=False, + scheduler='no', + warmup_steps=0, + lr_min=0.0, + warmup_lr_init=0.0, + patch_size=8, 
+ temporal_patch_size=4, + embedding_dim=256, + codebook_dim=codebook_dim, # Use parameter, not hardcoded 16 + use_vae=True, + eq_scale_prior=0.0, + eq_angle_prior=0.0, + use_stochastic_depth=False, + drop_rate=0.0, + schedule_mode=schedule_mode, + lr_drop=None, + lr_drop_rate=0.1, + keep_first_quant=False, + keep_last_quant=False, + remove_residual_detach=False, + use_out_phi=False, + use_out_phi_res=False, + use_lecam_reg=False, + lecam_weight=0.05, + perceptual_model='vgg16', + base_ch_disc=64, + random_flip=False, + flip_prob=0.5, + flip_mode='stochastic', + max_flip_lvl=1, + not_load_optimizer=False, + use_lecam_reg_zero=False, + freeze_encoder=False, + rm_downsample=False, + random_flip_1lvl=False, + flip_lvl_idx=0, + drop_when_test=False, + drop_lvl_idx=None, + drop_lvl_num=0, + compute_all_commitment=False, + disable_codebook_usage=False, + freeze_enc_main=False, + freeze_dec_main=False, + random_short_schedule=False, + short_schedule_prob=0.5, + use_bernoulli=False, + use_rot_trick=False, + disable_flip_prob=0.0, + dino_disc=False, + quantizer_type='MultiScaleBSQTP', + lfq_weight=0.0, + entropy_loss_weight=0.1, + visu_every=1000, + commitment_loss_weight=0.25, + bsq_version='v1', + diversity_gamma=1, + bs1_for1024=False, + casual_multi_scale=False, + double_compress_t=False, + temporal_slicing=False, + latent_adjust_type=None, + compute_latent_loss=False, + latent_loss_weight=0.0, + use_raw_latentz=False, + last_scale_repeat_n=0, + num_lvl_fsq=5, + use_midscale_sup=False, + midscale_list=[0.5, 0.75, 1.0], + use_eq=False, + eq_prob=0.5, + disc_version='v1', + magvit_disc=False, + disc_type='patchgan', + sigmoid_in_disc=False, + activation_in_disc='leaky_relu', + apply_blur=False, + apply_noise=False, + dis_warmup_steps=0, + dis_lr_multiplier=1.0, + dis_minlr_multiplier=False, + disc_channels=64, + disc_layers=3, + discriminator_iter_start=0, + disc_pretrain_iter=0, + disc_optim_steps=1, + disc_warmup=0, + disc_pool='no', + disc_pool_size=100, + disc_temporal_compress='yes', + disc_use_blur='yes', + disc_stylegan_downsample_base=2, + fix_model=['no'], + recon_loss_type='l1', + image_gan_weight=1.0, + video_gan_weight=1.0, + image_disc_weight=0.0, + video_disc_weight=0.0, + vf_weight=0.0, + vf_weight_approx=-1, + vf_distmat_margin=0.25, + vf_cos_margin=0.5, + temporal_alignment=None, + l1_weight=4.0, + gan_feat_weight=0.0, + lpips_model='vgg', + perceptual_weight=0.0, + video_perceptual_weight=None, + video_perceptual_layers=[], + kl_weight=0.0, + norm_type='rms', + disc_loss_type='hinge', + gan_image4video='yes', + use_checkpoint=False, + precision='fp32', + encoder_dtype='fp32', + decoder_dtype='fp32', + upcast_attention='', + upcast_tf32=False, + tokenizer='cogvideoxd', + pretrained=None, + pretrained_mode='full', + pretrained_ema='no', + inflation_pe=False, + init_vgen='no', + no_init_idis=False, + init_idis='keep', + init_vdis='no', + enable_nan_detector=False, + turn_on_profiler=False, + profiler_scheduler_wait_steps=10, + debug=False, + video_logger=False, + bytenas='sg', + username='bin.yan', + seed=1234, + vq_to_vae=False, + load_not_strict=False, + zero=0, + bucket_cap_mb=40, + manual_gc_interval=10000, + data_path=[''], + data_type=[''], + dataset_list=['wanxvideo-v1'], + fps=[-1], + dataaug='resizecrop', + multi_resolution=False, + random_bucket_ratio=0.0, + sequence_length=81, + resolution=[(480, 864)], + resize_bucket=None, + resize_bucket_use_self='yes', + scaling_aug='no', + batch_size=[1], + num_workers=0, + image_channels=3, + in_channels=3, + out_channels=3, + 
down_block_types=['CogVideoXDownBlock3D', 'CogVideoXDownBlock3D', 'CogVideoXDownBlock3D', 'CogVideoXDownBlock3D'], + down_block_mode='dc', + up_block_types=['CogVideoXUpBlock3D', 'CogVideoXUpBlock3D', 'CogVideoXUpBlock3D', 'CogVideoXUpBlock3D'], + up_block_mode='dc', + block_out_channels=[96, 192, 384, 384, 384], + layers_per_block=2, + latent_channels=16, + act_fn='silu', + norm_eps=1e-06, + norm_num_groups=32, + spatial_compression_list=[2, 2, 2], + temporal_compression_list=[2, 2], + sample_height=480, + sample_width=720, + use_quant_conv=False, + use_post_quant_conv=False, + down_layer='3d-dc', + down_norm=True, + up_layer='3d-dc', + up_norm=True, + pad_mode='constant', + dropout_z=0.0, + flux_weight=0, + cycle_weight=0, + cycle_feat_weight=0, + cycle_gan_weight=0, + cycle_loop=0, + cycle_norm='no', + cycle_deterministic='no', + cycle_kl_weight=0, + z_drop=0.0, + intermediate_tensor_dir='/tmp', + codebook_dim_low=codebook_dim//4, + freeze_decoder=False, + semantic_scale_dim=global_args.semantic_scale_dim, + detail_scale_dim=global_args.detail_scale_dim, + use_learnable_dim_proj=global_args.use_learnable_dim_proj, + detail_scale_min_tokens=global_args.detail_scale_min_tokens, + use_feat_proj=global_args.use_feat_proj, + semantic_scales=global_args.semantic_scales, + use_multi_scale=0, + quant_not_rely_256=0, + semantic_num_lvl=2, + detail_num_lvl=2, + ) + + vae = AutoencoderKLCogVideoX(args) + state_dict = torch.load(args.vqgan_ckpt, map_location=torch.device("cpu"), weights_only=True) + if args.ema == "yes": + print("testing ema weights") + vae.load_state_dict(state_dict["ema"], strict=False) + else: + vae.load_state_dict(state_dict["vae"], strict=False) + + vae.enable_slicing() + if test_mode: + vae.eval() + [p.requires_grad_(False) for p in vae.parameters()] + return vae + +# Replicate transform function to avoid importing from run_infinity +def transform(pil_img, tgt_h, tgt_w): + """Transform PIL image to tensor, resizing and center cropping (same as run_infinity.py). + Returns tensor in [-1, 1] range. 
+ """ + import PIL.Image as PImage + from torchvision.transforms.functional import to_tensor + width, height = pil_img.size + if width / height <= tgt_w / tgt_h: + resized_width = tgt_w + resized_height = int(tgt_w / (width / height)) + else: + resized_height = tgt_h + resized_width = int((width / height) * tgt_h) + pil_img = pil_img.resize((resized_width, resized_height), resample=PImage.LANCZOS) + # crop the center out + arr = np.array(pil_img) + crop_y = (arr.shape[0] - tgt_h) // 2 + crop_x = (arr.shape[1] - tgt_w) // 2 + im = to_tensor(arr[crop_y: crop_y + tgt_h, crop_x: crop_x + tgt_w]) + # Convert from [0, 1] to [-1, 1]: im * 2 - 1 + return im.add(im).add_(-1) + +# Simple Args class to avoid tap dependency +# Must include all fields required by quantizer initialization +class SimpleArgs: + def __init__(self): + self.vae_path = "" + self.vae_type = 18 + self.videovae = 10 + self.device = 'cuda' + self.encoder_dtype = 'float32' + self.decoder_dtype = 'float32' + + # Quantizer required fields (with defaults) + # These are critical for quantizer initialization in video_vae_model + self.semantic_scale_dim = 16 # Default based on common config + self.detail_scale_dim = 64 # Default based on common config + self.use_learnable_dim_proj = 0 + self.detail_scale_min_tokens = 80 + self.use_feat_proj = 2 # 2 is common for this VAE type + self.semantic_scales = 8 # Number of semantic scales + + +def add_text_to_image(image_tensor, text, position=(10, 30)): + """ + Add text label to an image tensor. + + Args: + image_tensor: Image tensor [C, H, W] in [0, 1] + text: Text to add + position: (x, y) position for text + Returns: + Image tensor with text [C, H, W] + """ + # Convert to PIL Image + image_np = image_tensor.permute(1, 2, 0).cpu().numpy() # [H, W, C] + image_np = np.clip(image_np, 0, 1) + image_np = (image_np * 255).astype(np.uint8) + pil_image = Image.fromarray(image_np) + + # Add text + from PIL import ImageDraw, ImageFont + draw = ImageDraw.Draw(pil_image) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 24) + except: + try: + font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) + except: + font = ImageFont.load_default() + + # Draw white text with black outline + x, y = position + # Draw outline + for adj in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]: + draw.text((x + adj[0], y + adj[1]), text, font=font, fill=(0, 0, 0)) + # Draw main text + draw.text((x, y), text, font=font, fill=(255, 255, 255)) + + # Convert back to tensor + image_tensor = transforms.ToTensor()(pil_image) + return image_tensor + + +def create_comparison_grid(original, reconstructed, output_path, nrow=4): + """ + Create a grid image comparing original and reconstructed frames. 
+ + Args: + original: Original video tensor [C, F, H, W] + reconstructed: Reconstructed video tensor [C, F, H, W] + output_path: Path to save the grid image + nrow: Number of frames per row + """ + # Get number of frames + F = min(original.shape[1], reconstructed.shape[1]) + + # Select frames to display (same as test_cosmos_vqvae.py) + num_frames_to_show = min(8, F) + frame_indices = np.linspace(0, F - 1, num_frames_to_show, dtype=int) + + frames_list = [] + for idx in frame_indices: + # Original frame with label + orig_frame = original[:, idx, :, :].clone() # [C, H, W] + orig_frame = add_text_to_image(orig_frame, "Original", position=(10, 10)) + frames_list.append(orig_frame) + + # Reconstructed frame with label + recon_frame = reconstructed[:, idx, :, :].clone() # [C, H, W] + recon_frame = add_text_to_image(recon_frame, "Reconstructed", position=(10, 10)) + frames_list.append(recon_frame) + + # Create grid (nrow * 2 because each frame has original and reconstructed) + frames_tensor = torch.stack(frames_list, dim=0) + grid = make_grid(frames_tensor, nrow=nrow * 2, padding=2, pad_value=1.0) + + save_image(grid, output_path) + print(f"Saved comparison grid to: {output_path}") + + +def main(): + # Use InfinityStar's toy video + video_path = "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4" + if not os.path.exists(video_path): + print(f"Video not found: {video_path}") + print("Please run from InfinityStar root directory") + return + + # VAE path + vae_path = "/mnt/Meissonic/InfinityStar/infinitystar_videovae.pth" + if not os.path.exists(vae_path): + print(f"VAE not found: {vae_path}") + return + + print("=" * 80) + print("Loading VAE using InfinityStar's video_vae_model...") + print("=" * 80) + + # Load VAE directly using video_vae_model (same as load_visual_tokenizer but avoids arg_util) + schedule_mode = "dynamic" + codebook_dim = 18 # vae_type + + print(f"Loading VAE from: {vae_path}") + print(f" schedule_mode: {schedule_mode}") + print(f" codebook_dim: {codebook_dim}") + print(f" videovae: 10 (absorb patchify)") + + # Create args with all required fields for video_vae_model + args = SimpleArgs() + args.vae_path = vae_path + args.vae_type = 18 + args.videovae = 10 + + # All required fields are already set in SimpleArgs.__init__ + # But we can override if needed + print(f" semantic_scale_dim: {args.semantic_scale_dim}") + print(f" detail_scale_dim: {args.detail_scale_dim}") + print(f" use_feat_proj: {args.use_feat_proj}") + print(f" semantic_scales: {args.semantic_scales}") + + # Load VAE using video_vae_model directly + vae = video_vae_model(vae_path, schedule_mode, codebook_dim, global_args=args, test_mode=True) + vae = vae.float().to('cuda') + vae.eval() + [p.requires_grad_(False) for p in vae.parameters()] + + print(f"VAE loaded: {type(vae)}") + print(f" Device: {next(vae.parameters()).device}") + print(f" Dtype: {next(vae.parameters()).dtype}") + + print("\n" + "=" * 80) + print("Loading video using InfinityStar's EncodedVideoDecord...") + print("=" * 80) + + # Load video using InfinityStar's video decoder + video = EncodedVideoDecord(video_path, os.path.basename(video_path), num_threads=0) + duration = video._duration + print(f"Video duration: {duration:.2f} seconds") + + # Get first 5 seconds (81 frames at ~16 fps) + num_frames = 81 + raw_video, _ = video.get_clip(0, 5, num_frames) + print(f"Loaded {len(raw_video)} frames") + + # Transform frames like in InfinityStar's code + # Use 480p resolution (480x848 for 16:9) + tgt_h, tgt_w = 384,672 + video_T3HW = 
[transform(Image.fromarray(frame).convert("RGB"), tgt_h, tgt_w) for frame in raw_video] + video_T3HW = torch.stack(video_T3HW, 0) # [t, 3, h, w] + video_bcthw = video_T3HW.permute(1, 0, 2, 3).unsqueeze(0) # [1, 3, t, h, w] + + print(f"Video tensor shape: {video_bcthw.shape}") + print(f"Video tensor range: [{video_bcthw.min():.3f}, {video_bcthw.max():.3f}]") + + # Check if video is in [0, 1] or [-1, 1] + if video_bcthw.min() >= 0 and video_bcthw.max() <= 1.0: + print("Video is in [0, 1], converting to [-1, 1] for VAE") + video_bcthw = video_bcthw * 2.0 - 1.0 + elif video_bcthw.min() < 0: + print("Video is already in [-1, 1]") + + video_bcthw = video_bcthw.cuda() + print(f"Video for VAE range: [{video_bcthw.min():.3f}, {video_bcthw.max():.3f}]") + + print("\n" + "=" * 80) + print("Encoding using vae.encode_for_raw_features (InfinityStar's method)...") + print("=" * 80) + print("Note: This is a VQ-VAE (Vector Quantized VAE) with quantizer.") + print(" encode_for_raw_features returns continuous latent (for transformer training).") + print(" We will use quantizer to get discrete codes (indices).") + print("=" * 80) + + with torch.no_grad(): + # Use InfinityStar's encode_for_raw_features to get continuous latent + raw_features, _, _ = vae.encode_for_raw_features( + video_bcthw, + scale_schedule=None, + slice=True + ) + print(f"Continuous latent shape: {raw_features.shape}") + print(f"Continuous latent range: [{raw_features.min():.3f}, {raw_features.max():.3f}]") + + # Check if quantizer exists and use it to get discrete codes + if hasattr(vae, 'quantizer') and vae.quantizer is not None: + print(f"\nQuantizer detected: {type(vae.quantizer).__name__}") + print(f"Raw features shape: {raw_features.shape}") + print(f"Quantizer schedule_mode: {vae.quantizer.schedule_mode}") + + B, C, T, H, W = raw_features.shape + print(f"Latent resolution: H={H}, W={W}") + + # List supported resolutions for the current schedule_mode + from infinity.models.videovae.modules.quantizer.multiscale_bsq_tp_absorb_patchify import get_latent2scale_schedule + from infinity.models.videovae.utils.dynamic_resolution import predefined_HW_Scales_dynamic + + print(f"\nSupported resolutions for schedule_mode='{vae.quantizer.schedule_mode}':") + if vae.quantizer.schedule_mode == "dynamic": + supported_resolutions = sorted(list(predefined_HW_Scales_dynamic.keys())) + print(f" {len(supported_resolutions)} resolutions:") + for res in supported_resolutions: + print(f" - {res}") + elif vae.quantizer.schedule_mode == "original": + # From get_latent2scale_schedule function + supported_resolutions = [(16, 16), (36, 64), (18, 32), (30, 53), (32, 32), (64, 64)] + print(f" {len(supported_resolutions)} resolutions:") + for res in supported_resolutions: + print(f" - {res}") + else: + print(f" (Please check quantizer code for mode '{vae.quantizer.schedule_mode}')") + supported_resolutions = [] + + # Check if current resolution is supported + is_supported = False + if vae.quantizer.schedule_mode == "dynamic": + is_supported = (H, W) in predefined_HW_Scales_dynamic + elif vae.quantizer.schedule_mode == "original": + is_supported = (H, W) in [(16, 16), (36, 64), (18, 32), (30, 53), (32, 32), (64, 64)] + + if not is_supported: + print(f"\n❌ ERROR: Resolution ({H}, {W}) is NOT supported for schedule_mode='{vae.quantizer.schedule_mode}'") + print(f" Please use one of the supported resolutions listed above.") + print(f" Or change the video resolution to match a supported one.") + print(f"\n To fix this, you can:") + print(f" 1. 
Change video resolution to one of: {supported_resolutions[:5]}...") + print(f" 2. Or manually add ({H}, {W}) to predefined_HW_Scales_dynamic") + raise ValueError(f"Resolution ({H}, {W}) not supported for schedule_mode='{vae.quantizer.schedule_mode}'. " + f"Supported resolutions: {supported_resolutions}") + + print(f"\n✓ Resolution ({H}, {W}) is supported!") + print("Quantizing to get discrete codes (indices)...") + print(" Note: Fixed tower_split_index bug in quantizer for non-infinity_video_two_pyramid modes.") + + try: + # Pass tensor directly (not as list) + # The quantizer forward method has been fixed to initialize tower_split_index for non-infinity_video_two_pyramid modes + result = vae.quantizer(raw_features) + # The quantizer returns: (quantized_out, all_indices, all_bit_indices, residual_norm_per_scale, all_losses, var_inputs) + if isinstance(result, (list, tuple)) and len(result) >= 2: + quantized_out, all_indices, all_bit_indices, residual_norm_per_scale, all_losses, var_inputs = result[:6] + else: + raise ValueError(f"Unexpected return format from quantizer: {type(result)}, length: {len(result) if isinstance(result, (list, tuple)) else 'N/A'}") + + # quantized_out is already the final quantized latent tensor, not a list + quantized_latent = quantized_out + + # Extract discrete indices (all_indices is a list of index tensors) + if isinstance(all_indices, (list, tuple)) and len(all_indices) > 0: + discrete_indices = all_indices[0] # Use first scale's indices for display + else: + discrete_indices = all_indices + + if discrete_indices is not None: + print(f"✓ Quantization successful!") + print(f" Discrete indices shape: {discrete_indices.shape}") + print(f" Discrete indices dtype: {discrete_indices.dtype}") + print(f" Discrete indices range: [{discrete_indices.min().item()}, {discrete_indices.max().item()}]") + unique_count = torch.unique(discrete_indices).numel() + print(f" Discrete indices unique values: {unique_count} (codebook size)") + + print(f" Quantized latent shape: {quantized_latent.shape}") + print(f" Quantized latent range: [{quantized_latent.min():.3f}, {quantized_latent.max():.3f}]") + + latent_to_decode = quantized_latent + use_quantized = True + except Exception as e: + import traceback + print(f"\n❌ ERROR: Quantization failed!") + print(f" Error: {e}") + print(f" Error type: {type(e).__name__}") + print(f"\n Full traceback:") + print(traceback.format_exc()) + raise RuntimeError(f"Quantization failed: {e}. 
This is required for testing quantizer performance.") from e + else: + print(" No quantizer found, using continuous latent (VAE mode, not VQ-VAE).") + latent_to_decode = raw_features + use_quantized = False + discrete_indices = None + + print("\n" + "=" * 80) + print("Decoding using vae.decode (InfinityStar's method)...") + if use_quantized: + print(" Using quantized latent (VQ-VAE path with discrete codes)") + else: + print(" Using continuous latent (VAE path, no quantization)") + print("=" * 80) + + with torch.no_grad(): + # Use InfinityStar's decode + reconstructed = vae.decode(latent_to_decode, slice=True) + if isinstance(reconstructed, tuple): + reconstructed = reconstructed[0] + + # Clamp like in InfinityStar's code + reconstructed = torch.clamp(reconstructed, min=-1, max=1) + + print(f"Reconstructed shape: {reconstructed.shape}") + print(f"Reconstructed range: [{reconstructed.min():.3f}, {reconstructed.max():.3f}]") + + # Convert to [0, 1] for visualization + original_01 = (video_bcthw + 1.0) / 2.0 + reconstructed_01 = (reconstructed + 1.0) / 2.0 + original_01 = torch.clamp(original_01, 0, 1) + reconstructed_01 = torch.clamp(reconstructed_01, 0, 1) + + # Convert from [B, C, T, H, W] to [C, T, H, W] for grid creation + original_01_video = original_01.squeeze(0) # [C, T, H, W] + reconstructed_01_video = reconstructed_01.squeeze(0) # [C, T, H, W] + + # Save comparison + output_dir = "vae_reconstruction_test" + os.makedirs(output_dir, exist_ok=True) + + print("\n" + "=" * 80) + print("Creating comparison grid (same format as test_cosmos_vqvae.py)...") + print("=" * 80) + + # Create comparison grid (same as test_cosmos_vqvae.py) + grid_output_path = os.path.join(output_dir, "comparison_grid.png") + create_comparison_grid(original_01_video, reconstructed_01_video, grid_output_path, nrow=4) + + # Save as video (keep the video saving logic) + print("\nSaving comparison video...") + video_frames = [] + for i in range(min(original_01.shape[2], reconstructed_01.shape[2])): + orig_frame = original_01[0, :, i, :, :].permute(1, 2, 0).cpu().numpy() + recon_frame = reconstructed_01[0, :, i, :, :].permute(1, 2, 0).cpu().numpy() + + orig_frame = (orig_frame * 255).astype(np.uint8) + recon_frame = (recon_frame * 255).astype(np.uint8) + + # Ensure it's writable and contiguous + side_by_side = np.hstack([orig_frame, recon_frame]).copy() + side_by_side = np.ascontiguousarray(side_by_side) + cv2.putText(side_by_side, "Original", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) + cv2.putText(side_by_side, "Reconstructed", (tgt_w + 10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2) + + video_frames.append(cv2.cvtColor(side_by_side, cv2.COLOR_RGB2BGR)) + + video_path_out = os.path.join(output_dir, "comparison.mp4") + imageio.mimsave(video_path_out, video_frames, fps=8) + print(f"Saved video: {video_path_out}") + + print("\n" + "=" * 80) + print("Test complete!") + print(f"Results saved to: {output_dir}") + print(f" - Comparison grid: {grid_output_path}") + print(f" - Comparison video: {video_path_out}") + print("=" * 80) + +if __name__ == "__main__": + main() + diff --git a/Meissonic/InfinityStar/tools/__init__.py b/Meissonic/InfinityStar/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce68af73834914377d02a72e2a8b5c04781718ac --- /dev/null +++ b/Meissonic/InfinityStar/tools/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT \ No newline at end of file diff --git 
a/Meissonic/InfinityStar/tools/__pycache__/__init__.cpython-310.pyc b/Meissonic/InfinityStar/tools/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d01c6638a67a15fb77f9abbbbd2e3e20781d8705 Binary files /dev/null and b/Meissonic/InfinityStar/tools/__pycache__/__init__.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/tools/__pycache__/run_infinity.cpython-310.pyc b/Meissonic/InfinityStar/tools/__pycache__/run_infinity.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d3590baa0c1b0c1d4e329dd1a8139bd812c52bf Binary files /dev/null and b/Meissonic/InfinityStar/tools/__pycache__/run_infinity.cpython-310.pyc differ diff --git a/Meissonic/InfinityStar/tools/infer_interact_480p.py b/Meissonic/InfinityStar/tools/infer_interact_480p.py new file mode 100644 index 0000000000000000000000000000000000000000..493ab68b5d9384428e01ee2cce597b3f6ef8a505 --- /dev/null +++ b/Meissonic/InfinityStar/tools/infer_interact_480p.py @@ -0,0 +1,239 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import sys +import json +import argparse +import os +import os.path as osp +import sys +sys.path.append(osp.dirname(osp.dirname(__file__))) + +import cv2 +import torch +import random +import shutil +import numpy as np + +from tools.run_infinity import * +from infinity.utils.video_decoder import EncodedVideoOpencv +from infinity.schedules.dynamic_resolution import get_dynamic_resolution_meta, get_first_full_spatial_size_scale_index +from infinity.schedules import get_encode_decode_func + + +def tensor2images(tensor): + """Convert [bs,3,t,h,w] tensor to list of np.uint8 images + """ + tensor = (tensor + 1) / 2 + tensor = torch.clamp(tensor, 0, 1) + tensor = tensor.permute(0,2,3,4,1) # [bs, 3, t, h, w] -> [bs, t, h, w, 3] + tensor = tensor.mul_(255).to(torch.uint8).flip(dims=(4,)) + tensor = tensor.cpu().numpy() + return tensor + +if __name__ == '__main__': + args=argparse.Namespace( + pn='0.40M', + fps=16, + model_type='infinity_qwen8b', + h_div_w_template=1.000, + cache_dir='/dev/shm', + seed=0, + bf16=0, + temporal_slice=0, + enable_model_cache=0, + scale_embeds_num=128, + train_h_div_w_list=[0.571, 1.0], + steps_per_frame=3, + context_frames=1000, + image_batch_size=1, + video_batch_size=1, + down_size_limit=340, + casual_multi_scale=0, + noise_apply_layers=200, + noise_apply_requant=1, + noise_apply_strength=[0. 
for _ in range(100)], + video_caption_type='tarsier2_caption', + temporal_compress_rate=4, + cached_video_frames=81, + learn_residual=0, + use_diffloss=0, + diffusion_batch_mul=0, + video_fps=16, + power_value=1.0, + noise_apply_random_one=0, + inject_sync=0, + scales_256=11, + dummy_text_len_in_seq=0, + scale_max_token_len=-1, + same_batch_among_ranks=0, + use_flex_attn=0, + rope2d_each_sa_layer=1, + rope2d_normalized_by_hw=2, + sampling_per_bits=1, + ) + + checkpoints_dir='./' + args.model_path=os.path.join(checkpoints_dir, 'InfinityStarInteract_24K_iters') + args.vae_path=os.path.join(checkpoints_dir, 'infinitystar_videovae.pth') + args.text_encoder_ckpt=os.path.join(checkpoints_dir, 'text_encoder/flan-t5-xl-official/') + args.checkpoint_type='torch_shard' + + + args.set_motion_score = -1 + args.min_scale_ind=3 + args.loop_times_per_scale=1 + args.global_sid_pe=0 + args.h_div_w = 0.571 + args.input_noise=1 + args.use_cfg, args.use_apg, args.apg_norm_threshold = 1, 0, 0.15 + args.diffusion_steps=-1 + args.infinity_diffusion_sample_topk=1 + args.noise_input=0 + args.reduce_accumulate_error_method='bsc' + args.map_to_wide_weights=0 + args.min_duration=-1 + args.use_space_time_quant=0 + args.use_learnable_dim_proj=0 + args.semantic_scale_dim=16 + args.detail_scale_dim=64 + args.use_prompt_engineering = False + args.context_from_largest_no=1 + args.max_repeat_times=1000 + args.text_channels=2048 + args.dynamic_scale_schedule='infinity_star_interact' + args.mask_type='infinity_star_interact' + args.semantic_scales=11 + args.detail_scale_min_tokens=350 + args.video_frames=161 + args.max_duration=10 + args.videovae=10 + args.vae_type=64 + args.num_lvl=2 + args.num_of_label_value=args.num_lvl + args.semantic_num_lvl=args.num_lvl + args.semantic_scale_dim=16 + args.detail_num_lvl=args.num_lvl + args.detail_scale_dim=64 + args.use_clipwise_caption=1 + args.use_prompt_engineering = False + args.vae_detail='discrete_flow_vae' + args.use_feat_proj=2 + args.use_fsq_cls_head=0 + args.rope_type = '4d' + args.noise_apply_strength = 0.0 + args.task_type='t2v' + args.inner_scale_boost=0 + args.append_duration2caption=1 + args.n_sampes=1 + args.duration_resolution=1 + args.frames_inner_clip=20 + args.image_scale_repetition = '[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1]' + args.video_scale_repetition = args.image_scale_repetition + args.taui, args.tauv = 0.5, 0.5 + args.use_cfg, args.use_apg, args.cfg, args.apg_norm_threshold = 1, 0, 3, 0.05 + args.tau = [args.taui] * len(json.loads(args.image_scale_repetition)) + [args.tauv] * len(json.loads(args.video_scale_repetition)) + args.context_interval=2 + args.simple_text_proj=1 + args.apply_spatial_patchify=0 + args.use_two_stage_lfq=1 + args.fsdp_save_flatten_model=1 + args.two_gpu_infer=False + + scale_repetition = '' + gt_leak = -1 + quality_prompt = '' + + video_encode, video_decode, get_visual_rope_embeds, get_scale_pack_info = get_encode_decode_func(args.dynamic_scale_schedule) + total_secs = (args.video_frames-1) / args.fps + if args.two_gpu_infer: + args.other_device = 'cuda:1' + else: + args.other_device = 'cuda' + + # load text encoder + text_tokenizer, text_encoder = load_tokenizer(t5_path=args.text_encoder_ckpt) + # load vae + vae = load_visual_tokenizer(args) + # load infinity + infinity = load_transformer(vae, args) + + save_dir_root = osp.join('tmp_videos', osp.basename(osp.dirname(args.model_path)), osp.basename(args.model_path)) + save_name = 
f'pn{args.pn}_fps{args.fps}_elegant_overfit100_rep_vf{args.video_frames}_cinterval_{args.context_interval}_use_cfg_{args.cfg}_use_apg_{args.use_apg}_cfg{args.cfg}_apg_norm_thre_{args.apg_norm_threshold}_taui{args.taui:.1f}_tauv{args.tauv:.1f}_gt_leak_{gt_leak}' + save_dir_root = osp.join(save_dir_root, save_name) + if osp.exists(save_dir_root): + shutil.rmtree(save_dir_root) + + print(args) + dynamic_resolution_h_w, h_div_w_templates = get_dynamic_resolution_meta(args.dynamic_scale_schedule, args.video_frames) + h_div_w_template_list = np.array(list(dynamic_resolution_h_w.keys())) + + test_data_dir = 'data/interactive_toy_videos' + for dir_ind, story_id in enumerate(os.listdir(test_data_dir)): + story_dir = osp.join(test_data_dir, story_id) + prompt_path = osp.join(story_dir, 'prompt.txt') + with open(prompt_path, 'r') as f: + prompts = f.readlines() + prompts = [f'<<>>{item.strip()}' for item in prompts] + first_frame_features = None + for ind, prompt in enumerate(prompts): + save_dir = osp.join(save_dir_root, f'{dir_ind:04d}_{story_id}') + if ind == 0: + mode = 'first_iv_clip' + video = EncodedVideoOpencv(osp.join(story_dir, '0000_refine_720p.mp4'), '0000_refine_720p.mp4', num_threads=0) + raw_video, _ = video.get_clip(video.duration-5, video.duration, 81) + h, w, _ = raw_video[0].shape + h_div_w_template_ = h_div_w_template_list[np.argmin(np.abs(h/w-h_div_w_template_list))] + scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['pt2scale_schedule'][21] + vae_stride = 16 + tgt_h, tgt_w = scale_schedule[-1][1] * vae_stride, scale_schedule[-1][2] * vae_stride + img_T3HW = [transform(Image.fromarray(frame[:,:,::-1]), tgt_h, tgt_w) for frame in raw_video] + img_T3HW = torch.stack(img_T3HW, 0) # [t,3,h,w] + img_bcthw = img_T3HW.permute(1,0,2,3).unsqueeze(0).to('cuda') # [c,t,h,w] -> [b,c,t,h,w] + args.first_full_spatial_size_scale_index = get_first_full_spatial_size_scale_index(scale_schedule) + args.tower_split_index = args.first_full_spatial_size_scale_index + 1 + scales_in_one_clip = args.first_full_spatial_size_scale_index + 1 + cur_scale_schedule = scale_schedule[scales_in_one_clip:] + context_info = get_scale_pack_info(cur_scale_schedule, args.first_full_spatial_size_scale_index, args) + former_clip_features, _, _ = vae.encode_for_raw_features(img_bcthw, scale_schedule=None, slice=True) + # recons first frame + recons_video = vae.decode(former_clip_features, slice=True) + recons_video = tensor2images(recons_video) + ref_video_path = osp.join(save_dir, f"{ind:04d}.mp4") + save_video(recons_video[0], fps=args.fps, save_filepath=ref_video_path) + if first_frame_features is None: + first_frame_features = former_clip_features[:,:,0:1] + raw_video = np.array([cv2.resize(img, (tgt_w, tgt_h)) for img in raw_video]) + ref_video_path = osp.join(save_dir, f"{ind:04d}_gt.mp4") + save_video(raw_video, fps=args.fps, save_filepath=ref_video_path) + shutil.copyfile(prompt_path, osp.join(save_dir, f"prompt.txt")) + else: + mode = 'second_v_clip' + video, former_clip_features = gen_one_example( + infinity, + vae, + text_tokenizer, + text_encoder, + prompt, + negative_prompt="", + g_seed=args.seed, + gt_leak=-1, + gt_ls_Bl=None, + cfg_list=args.cfg, + tau_list=args.tau, + scale_schedule=cur_scale_schedule, + vae_type=args.vae_type, + sampling_per_bits=args.sampling_per_bits, + enable_positive_prompt=False, + low_vram_mode=True, + args=args, + get_visual_rope_embeds=get_visual_rope_embeds, + context_info=context_info, + noise_list=None, + mode=mode, + 
former_clip_features=former_clip_features, + first_frame_features=first_frame_features, + ) + video = video.cpu().numpy() + ref_video_path = osp.join(save_dir, f"{ind:04d}.mp4") + save_video(video, fps=args.fps, save_filepath=ref_video_path) diff --git a/Meissonic/InfinityStar/tools/infer_video_480p.py b/Meissonic/InfinityStar/tools/infer_video_480p.py new file mode 100644 index 0000000000000000000000000000000000000000..6e277caabe7bd25c2eb45da5b3103cf6c58b7021 --- /dev/null +++ b/Meissonic/InfinityStar/tools/infer_video_480p.py @@ -0,0 +1,223 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import sys +import json +import os +import os.path as osp +from tqdm import tqdm +import sys +import time +import numpy as np +import torch +import cv2 +import numpy as np +import argparse +from PIL import Image +sys.path.append(osp.dirname(osp.dirname(__file__))) +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + +from tools.run_infinity import load_tokenizer, load_transformer, load_visual_tokenizer, gen_one_example, save_video, transform +from infinity.models.self_correction import SelfCorrection +from infinity.schedules.dynamic_resolution import get_dynamic_resolution_meta, get_first_full_spatial_size_scale_index +from infinity.schedules import get_encode_decode_func +from infinity.utils.video_decoder import EncodedVideoDecord +from infinity.utils.arg_util import Args + + +def _init_prompt_rewriter(): + from tools.prompt_rewriter import OpenAIGPTModel + """Initialize the OpenAI GPT model.""" + # Initialize the OpenAI GPT model + model_name = 'gpt-4o-2024-08-06' + ak = os.environ.get("OPEN_API_KEY", "") + if len(ak) == 0: + raise ValueError("Please provide your OpenAI API key in the OPEN_API_KEY environment variable.") + model = OpenAIGPTModel(model_name, ak, if_global=True) + system_prompt = ( + "You are a large language model specialized in rewriting video descriptions. Your task is to modify the input description to make the video more realistic and beautiful. 0. Preserve ALL information, including style words and technical terms. 1. If the subject is related to person, you need to provide a detailed description focusing on basic visual characteristics of the person, such as appearance, clothing, expression, posture, etc. You need to make the person as beautiful and handsome as possible. When the subject is only one person or object, do not use they to describe him/her/it to avoid confusion with multiple subjects. 2. If the input does not include style, lighting, atmosphere, you can make reasonable associations. 3. We only generate a four-second video based on your descriptions. So do not generate descriptions that are too long, too complex or contain too many activities. 4. You can add some descriptions of camera movements with regards to the scenes and allow the scenes to have very natural and coherent movements. 6. If the input is in Chinese, translate the entire description to English. 7. Output ALL must be in English. 8. Here are some expanded descriptions that can serve as examples: 1. The video begins with a distant aerial view of a winding river cutting through a rocky landscape, with the sun casting a soft glow over the scene. As the camera moves closer, the river's flow becomes more visible, and the surrounding terrain appears more defined. The camera continues to approach, revealing a steep cliff with a person sitting on its edge. The person is positioned near the top of the cliff, overlooking the river below. 
The camera finally reaches a close-up view, showing the person sitting calmly on the cliff, with the river and landscape fully visible in the background. 2. In a laboratory setting, a machine with a metallic structure and a green platform is seen. A small, clear plastic bottle is positioned on the green platform. The machine has a control panel with red and green lights on the right side. A nozzle is positioned above the bottle, and it begins to dispense liquid into the bottle. The liquid is dispensed in small droplets, and the nozzle moves slightly between each droplet. The background includes other laboratory equipment and a mesh-like structure. 3. The video shows a panoramic view of a cityscape with a prominent building featuring a green dome and ornate architecture in the center. Surrounding the main building are several other structures, including a white building with balconies on the left and a taller building with multiple windows on the right. In the background, there are hills with scattered buildings and greenery. The camera remains stationary, capturing the scene from a fixed position, with no noticeable changes in the environment or the buildings throughout the frames. 4. In a dimly lit room with red and blue lighting, a person holds up a smartphone to record a video of a band performing. The band members are seated, with one holding a guitar and another playing a double bass. The smartphone screen shows the band members being recorded, with the camera capturing their movements and expressions. The background includes a lamp and some furniture, adding to the cozy atmosphere of the scene. 5. In a grassy area with scattered trees, a large tree stands prominently in the center. A lion is perched on a thick branch of this tree, looking out into the distance. The sky is overcast, adding a somber tone to the scene. 6. A man in a green sweater holding a paper turns around and speaks to a group of people seated in a theater. He then points at a man in a yellow sweater sitting in the front row. The man in the yellow sweater looks at the paper in his hand and begins to speak. The man in the green sweater lowers his head and then looks up at the man in the yellow sweater again. 7. An elderly man, wearing a beige sweater over a yellow shirt, is sitting in front of a laptop. He holds a pair of glasses in his right hand and appears to be deep in thought, resting his head on his hand. He then raises the glasses and rubs his eyes with his fingers, showing signs of fatigue. After rubbing his eyes, he places the glasses on his sweater and looks down at the laptop screen. 8. A woman and a child are sitting at a table, each holding a pencil and coloring on a piece of paper. The woman is coloring a green leafy plant, while the child is coloring a red and blue object. The table has several colored pencils, a container filled with more pencils, and a few small colorful blocks. The woman is wearing a striped shirt, and the child is focused on their drawing. 9. A person wearing teal running shoes and colorful socks is running on a wet, sandy surface. The camera captures the movement of their legs and feet as they lift off the ground and land back, creating a clear shadow on the wet sand. The shadow elongates and shifts with each step, indicating the person's motion. The background remains consistent with the wet, textured sand, and the focus is solely on the runner's feet and their shadow. 10. A man is running along the shoreline of a beach, with the ocean waves gently crashing onto the shore. 
The sun is setting in the background, casting a warm glow over the scene. The man is wearing a light-colored jacket and shorts, and his hair is blowing in the wind as he runs. The water splashes around his legs as he moves forward, and his reflection is visible on the wet sand. The waves create a dynamic and lively atmosphere as they roll in and out." + ) + gpt_model = OpenAIGPTModel(model_name, ak, if_global=True) + return gpt_model, system_prompt + +class InferencePipe: + def __init__(self, args): + # load text encoder + self.text_tokenizer, self.text_encoder = load_tokenizer(t5_path=args.text_encoder_ckpt) + # load vae + self.vae = load_visual_tokenizer(args) + self.vae = self.vae.float().to('cuda') + # load infinity + self.infinity = load_transformer(self.vae, args) + self.self_correction = SelfCorrection(self.vae, args) + + self._models = [self.text_tokenizer, self.text_encoder, self.vae, self.infinity, self.self_correction] + + self.video_encode, self.video_decode, self.get_visual_rope_embeds, self.get_scale_pack_info = get_encode_decode_func(args.dynamic_scale_schedule) + + if args.enable_rewriter: + self.gpt_model, self.system_prompt = _init_prompt_rewriter() + + +def perform_inference(pipe, data, args): + + prompt = data["prompt"] + seed = data["seed"] + mapped_duration=data['duration'] + num_frames=mapped_duration*16+1 + + # If an image_path is provided, perform image-to-video generation. + image_path = data.get("image_path", None) + video_path = data.get("video_path", None) + + + dynamic_resolution_h_w, h_div_w_templates = get_dynamic_resolution_meta(args.dynamic_scale_schedule, args.video_frames) + h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates-0.571))] + scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['pt2scale_schedule'][(num_frames-1)//4+1] + args.first_full_spatial_size_scale_index = get_first_full_spatial_size_scale_index(scale_schedule) + args.tower_split_index = args.first_full_spatial_size_scale_index + 1 + context_info = pipe.get_scale_pack_info(scale_schedule, args.first_full_spatial_size_scale_index, args) + scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['pt2scale_schedule'][(num_frames-1)//4+1] + tau = [args.tau_image] * args.tower_split_index + [args.tau_video] * (len(scale_schedule) - args.tower_split_index) + tgt_h, tgt_w = scale_schedule[-1][1] * 16, scale_schedule[-1][2] * 16 + gt_leak, gt_ls_Bl = -1, None + + if video_path is not None: + # Video continuation + video = EncodedVideoDecord(video_path, os.path.basename(video_path), num_threads=0) + duration = video._duration + if mapped_duration != 10: + raise ValueError('Video continuation only support 10 seconds generation.') + if duration < 5: + raise ValueError('Input video duration must be longer than 5 seconds.') + condition_scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['pt2scale_schedule'][(81-1)//4+1] + cond_tgt_h, cond_tgt_w = condition_scale_schedule[-1][1] * 16, condition_scale_schedule[-1][2] * 16 + raw_video, _ = video.get_clip(0, 5, 81) + video_T3HW = [transform(Image.fromarray(frame).convert("RGB"), cond_tgt_h, cond_tgt_w) for frame in raw_video] + video_T3HW = torch.stack(video_T3HW, 0) # [t,3,h,w] + video_bcthw = video_T3HW.permute(1,0,2,3).unsqueeze(0) # [c,t,h,w] -> [b,c,t,h,w] + _, _, gt_ls_Bl, _, _, _ = pipe.video_encode(pipe.vae, video_bcthw.cuda(), vae_features=None, self_correction=pipe.self_correction, args=args, infer_mode=True, dynamic_resolution_h_w=dynamic_resolution_h_w) + gt_leak=28 + elif 
image_path is not None: + # Image to Video + ref_image = [cv2.imread(image_path)[:,:,::-1]] + ref_img_T3HW = [transform(Image.fromarray(frame).convert("RGB"), tgt_h, tgt_w) for frame in ref_image] + ref_img_T3HW = torch.stack(ref_img_T3HW, 0) # [t,3,h,w] + ref_img_bcthw = ref_img_T3HW.permute(1,0,2,3).unsqueeze(0) # [c,t,h,w] -> [b,c,t,h,w] + _, _, gt_ls_Bl, _, _, _ = pipe.video_encode(pipe.vae, ref_img_bcthw.cuda(), vae_features=None, self_correction=pipe.self_correction, args=args, infer_mode=True, dynamic_resolution_h_w=dynamic_resolution_h_w) + gt_leak=14 + + generated_image_list = [] + negative_prompt='' + prompt = f'{prompt}, Close-up on big objects, emphasize scale and detail' + negative_prompt = "" + if args.append_duration2caption: + prompt = f'<<>>' + prompt + + start_time = time.time() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True), torch.no_grad(): + generated_image, _ = gen_one_example( + pipe.infinity, + pipe.vae, + pipe.text_tokenizer, + pipe.text_encoder, + prompt, + negative_prompt=negative_prompt, + g_seed=seed, + gt_leak=gt_leak, + gt_ls_Bl=gt_ls_Bl, + cfg_list=args.cfg, + tau_list=tau, + scale_schedule=scale_schedule, + cfg_insertion_layer=[0], + vae_type=args.vae_type, + sampling_per_bits=1, + enable_positive_prompt=0, + low_vram_mode=True, + args=args, + get_visual_rope_embeds=pipe.get_visual_rope_embeds, + context_info=context_info, + noise_list=None, + ) + if len(generated_image.shape) == 3: + generated_image = generated_image.unsqueeze(0) + print(generated_image.shape) + generated_image_list.append(generated_image) + + generated_image = torch.cat(generated_image_list, 2) + end_time = time.time() + elapsed_time = end_time - start_time + + return { + "output": generated_image.cpu().numpy(), + "elapsed_time": elapsed_time, + } + + +if __name__ == '__main__': + generation_duration = 5 # or 10 + checkpoints_dir = './' + # For optimal performance, enabling the prompt rewriter is recommended. 
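+ # The rewriter needs Azure OpenAI credentials from the environment (see the export examples
+ # below). A minimal sketch of checking them before flipping enable_rewriter (set below) to 1;
+ # the rewriter_ready name is illustrative only:
+ #   rewriter_ready = bool(os.environ.get("OPEN_API_KEY")) and bool(os.environ.get("GLOBAL_AZURE_ENDPOINT"))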
+ # To utilize the GPT model, ensure the following environment variables are set: + # export OPEN_API_KEY="YOUR_API_KEY" + # export GLOBAL_AZURE_ENDPOINT="YOUR_ENDPOINT" + enable_rewriter=0 + + # infer args + args = Args() + args.pn='0.40M' + args.fps=16 + args.video_frames=generation_duration * 16 + 1 + args.model_path=os.path.join(checkpoints_dir, 'infinitystar_8b_480p_weights') + args.checkpoint_type='torch_shard' # omnistore + args.vae_path=os.path.join(checkpoints_dir, 'infinitystar_videovae.pth') + args.text_encoder_ckpt=os.path.join(checkpoints_dir, 'text_encoder/flan-t5-xl-official/') + args.videovae=10 + args.model_type='infinity_qwen8b' + args.text_channels=2048 + args.dynamic_scale_schedule='infinity_elegant_clip20frames_v2' + args.bf16=1 + args.use_apg=1 + args.use_cfg=0 + args.cfg=34 + args.tau_image = 1 + args.tau_video = 0.4 + args.apg_norm_threshold=0.05 + args.image_scale_repetition='[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]' + args.video_scale_repetition='[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1]' + args.append_duration2caption=1 + args.use_two_stage_lfq=1 + args.detail_scale_min_tokens=350 + args.semantic_scales=11 + args.max_repeat_times=10000 + args.enable_rewriter=enable_rewriter + + # load models + pipe = InferencePipe(args) + + prompt = "A handsome smiling gardener inspecting plants, realistic cinematic lighting, detailed textures, ultra-realistic" + image_path = 'assets/reference_image.webp' # Remove this for Text-to-Video (T2V) generation + data = { + 'seed': 41, + 'image_path': image_path, + 'prompt': prompt, + 'duration': generation_duration, + } + if args.enable_rewriter: + # Step 1: Rewrite the prompt using GPT + # rewritten_prompt = prompt + rewritten_prompt = pipe.gpt_model( + prompt = ("Rewrite the following video descriptions, add more details of the subject and the camera movement to enhance the quality of the video. Do not use the word 'they' to refer to a single person or object. Concatenate all sentences together, not present them in paragraphs. 
Please rewrite with concise and clear language: " + + prompt), + system_prompt=pipe.system_prompt, + ) + print(f"Rewritten prompt: {rewritten_prompt}") + prompt = rewritten_prompt + data['prompt'] = prompt + + output_dict = perform_inference(pipe, data, args) + save_dir = 'output' + gen_video_path = osp.join(os.path.join(save_dir, 'gen_videos'), f'demo.mp4') + save_video(output_dict['output'], fps=args.fps, save_filepath=gen_video_path) + + print(f"Video genernation done: {gen_video_path=}") diff --git a/Meissonic/InfinityStar/tools/infer_video_720p.py b/Meissonic/InfinityStar/tools/infer_video_720p.py new file mode 100644 index 0000000000000000000000000000000000000000..7d9968e958f6c6e255458189aa7b6eb72861e100 --- /dev/null +++ b/Meissonic/InfinityStar/tools/infer_video_720p.py @@ -0,0 +1,201 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import sys +import json +import os +import os.path as osp +from tqdm import tqdm +import sys +import time +import numpy as np +import torch +import cv2 +import numpy as np +import argparse +from PIL import Image +sys.path.append(osp.dirname(osp.dirname(__file__))) +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + +from tools.run_infinity import load_tokenizer, load_transformer, load_visual_tokenizer, gen_one_example, save_video, transform +from infinity.models.self_correction import SelfCorrection +from infinity.schedules.dynamic_resolution import get_dynamic_resolution_meta, get_first_full_spatial_size_scale_index +from infinity.schedules import get_encode_decode_func +from infinity.utils.arg_util import Args + + +def _init_prompt_rewriter(): + from tools.prompt_rewriter import OpenAIGPTModel + """Initialize the OpenAI GPT model.""" + # Initialize the OpenAI GPT model + model_name = 'gpt-4o-2024-08-06' + ak = os.environ.get("OPEN_API_KEY", "") + if len(ak) == 0: + raise ValueError("Please provide your OpenAI API key in the OPEN_API_KEY environment variable.") + model = OpenAIGPTModel(model_name, ak, if_global=True) + system_prompt = ( + "You are a large language model specialized in rewriting video descriptions. Your task is to modify the input description to make the video more realistic and beautiful. 0. Preserve ALL information, including style words and technical terms. 1. If the subject is related to person, you need to provide a detailed description focusing on basic visual characteristics of the person, such as appearance, clothing, expression, posture, etc. You need to make the person as beautiful and handsome as possible. When the subject is only one person or object, do not use they to describe him/her/it to avoid confusion with multiple subjects. 2. If the input does not include style, lighting, atmosphere, you can make reasonable associations. 3. We only generate a four-second video based on your descriptions. So do not generate descriptions that are too long, too complex or contain too many activities. 4. You can add some descriptions of camera movements with regards to the scenes and allow the scenes to have very natural and coherent movements. 6. If the input is in Chinese, translate the entire description to English. 7. Output ALL must be in English. 8. Here are some expanded descriptions that can serve as examples: 1. The video begins with a distant aerial view of a winding river cutting through a rocky landscape, with the sun casting a soft glow over the scene. As the camera moves closer, the river's flow becomes more visible, and the surrounding terrain appears more defined. 
The camera continues to approach, revealing a steep cliff with a person sitting on its edge. The person is positioned near the top of the cliff, overlooking the river below. The camera finally reaches a close-up view, showing the person sitting calmly on the cliff, with the river and landscape fully visible in the background. 2. In a laboratory setting, a machine with a metallic structure and a green platform is seen. A small, clear plastic bottle is positioned on the green platform. The machine has a control panel with red and green lights on the right side. A nozzle is positioned above the bottle, and it begins to dispense liquid into the bottle. The liquid is dispensed in small droplets, and the nozzle moves slightly between each droplet. The background includes other laboratory equipment and a mesh-like structure. 3. The video shows a panoramic view of a cityscape with a prominent building featuring a green dome and ornate architecture in the center. Surrounding the main building are several other structures, including a white building with balconies on the left and a taller building with multiple windows on the right. In the background, there are hills with scattered buildings and greenery. The camera remains stationary, capturing the scene from a fixed position, with no noticeable changes in the environment or the buildings throughout the frames. 4. In a dimly lit room with red and blue lighting, a person holds up a smartphone to record a video of a band performing. The band members are seated, with one holding a guitar and another playing a double bass. The smartphone screen shows the band members being recorded, with the camera capturing their movements and expressions. The background includes a lamp and some furniture, adding to the cozy atmosphere of the scene. 5. In a grassy area with scattered trees, a large tree stands prominently in the center. A lion is perched on a thick branch of this tree, looking out into the distance. The sky is overcast, adding a somber tone to the scene. 6. A man in a green sweater holding a paper turns around and speaks to a group of people seated in a theater. He then points at a man in a yellow sweater sitting in the front row. The man in the yellow sweater looks at the paper in his hand and begins to speak. The man in the green sweater lowers his head and then looks up at the man in the yellow sweater again. 7. An elderly man, wearing a beige sweater over a yellow shirt, is sitting in front of a laptop. He holds a pair of glasses in his right hand and appears to be deep in thought, resting his head on his hand. He then raises the glasses and rubs his eyes with his fingers, showing signs of fatigue. After rubbing his eyes, he places the glasses on his sweater and looks down at the laptop screen. 8. A woman and a child are sitting at a table, each holding a pencil and coloring on a piece of paper. The woman is coloring a green leafy plant, while the child is coloring a red and blue object. The table has several colored pencils, a container filled with more pencils, and a few small colorful blocks. The woman is wearing a striped shirt, and the child is focused on their drawing. 9. A person wearing teal running shoes and colorful socks is running on a wet, sandy surface. The camera captures the movement of their legs and feet as they lift off the ground and land back, creating a clear shadow on the wet sand. The shadow elongates and shifts with each step, indicating the person's motion. 
The background remains consistent with the wet, textured sand, and the focus is solely on the runner's feet and their shadow. 10. A man is running along the shoreline of a beach, with the ocean waves gently crashing onto the shore. The sun is setting in the background, casting a warm glow over the scene. The man is wearing a light-colored jacket and shorts, and his hair is blowing in the wind as he runs. The water splashes around his legs as he moves forward, and his reflection is visible on the wet sand. The waves create a dynamic and lively atmosphere as they roll in and out." + ) + gpt_model = OpenAIGPTModel(model_name, ak, if_global=True) + return gpt_model, system_prompt + +class InferencePipe: + def __init__(self, args): + # load text encoder + self.text_tokenizer, self.text_encoder = load_tokenizer(t5_path=args.text_encoder_ckpt) + # load vae + self.vae = load_visual_tokenizer(args) + self.vae = self.vae.float().to('cuda') + # load infinity + self.infinity = load_transformer(self.vae, args) + self.self_correction = SelfCorrection(self.vae, args) + + self._models = [self.text_tokenizer, self.text_encoder, self.vae, self.infinity, self.self_correction] + + self.video_encode, self.video_decode, self.get_visual_rope_embeds, self.get_scale_pack_info = get_encode_decode_func(args.dynamic_scale_schedule) + + if args.enable_rewriter: + self.gpt_model, self.system_prompt = _init_prompt_rewriter() + + +def perform_inference(pipe, data, args): + + prompt = data["prompt"] + seed = data["seed"] + mapped_duration=5 + num_frames=81 + + # If an image_path is provided, perform image-to-video generation. + image_path = data.get("image_path", None) + + dynamic_resolution_h_w, h_div_w_templates = get_dynamic_resolution_meta(args.dynamic_scale_schedule, args.video_frames) + h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates-0.571))] + scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['pt2scale_schedule'][(num_frames-1)//4+1] + args.first_full_spatial_size_scale_index = get_first_full_spatial_size_scale_index(scale_schedule) + args.tower_split_index = args.first_full_spatial_size_scale_index + 1 + context_info = pipe.get_scale_pack_info(scale_schedule, args.first_full_spatial_size_scale_index, args) + scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['pt2scale_schedule'][(num_frames-1)//4+1] + tau = [args.tau_image] * args.tower_split_index + [args.tau_video] * (len(scale_schedule) - args.tower_split_index) + tgt_h, tgt_w = scale_schedule[-1][1] * 16, scale_schedule[-1][2] * 16 + gt_leak, gt_ls_Bl = -1, None + + if image_path is not None: + ref_image = [cv2.imread(image_path)[:,:,::-1]] + ref_img_T3HW = [transform(Image.fromarray(frame).convert("RGB"), tgt_h, tgt_w) for frame in ref_image] + ref_img_T3HW = torch.stack(ref_img_T3HW, 0) # [t,3,h,w] + ref_img_bcthw = ref_img_T3HW.permute(1,0,2,3).unsqueeze(0) # [c,t,h,w] -> [b,c,t,h,w] + _, _, gt_ls_Bl, _, _, _ = pipe.video_encode(pipe.vae, ref_img_bcthw.cuda(), vae_features=None, self_correction=pipe.self_correction, args=args, infer_mode=True, dynamic_resolution_h_w=dynamic_resolution_h_w) + gt_leak=len(scale_schedule)//2 + + generated_image_list = [] + negative_prompt='' + prompt = f'{prompt}, Close-up on big objects, emphasize scale and detail' + negative_prompt = "" + if args.append_duration2caption: + prompt = f'<<>>' + prompt + + start_time = time.time() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True), torch.no_grad(): + generated_image, _ = 
gen_one_example( + pipe.infinity, + pipe.vae, + pipe.text_tokenizer, + pipe.text_encoder, + prompt, + negative_prompt=negative_prompt, + g_seed=seed, + gt_leak=gt_leak, + gt_ls_Bl=gt_ls_Bl, + cfg_list=args.cfg, + tau_list=tau, + scale_schedule=scale_schedule, + cfg_insertion_layer=[0], + vae_type=args.vae_type, + sampling_per_bits=1, + enable_positive_prompt=0, + low_vram_mode=True, + args=args, + get_visual_rope_embeds=pipe.get_visual_rope_embeds, + context_info=context_info, + noise_list=None, + ) + if len(generated_image.shape) == 3: + generated_image = generated_image.unsqueeze(0) + print(generated_image.shape) + generated_image_list.append(generated_image) + + generated_image = torch.cat(generated_image_list, 2) + end_time = time.time() + elapsed_time = end_time - start_time + + return { + "output": generated_image.cpu().numpy(), + "elapsed_time": elapsed_time, + } + + +if __name__ == '__main__': + # For optimal performance, enabling the prompt rewriter is recommended. + # To utilize the GPT model, ensure the following environment variables are set: + # export OPEN_API_KEY="YOUR_API_KEY" + # export GLOBAL_AZURE_ENDPOINT="YOUR_ENDPOINT" + enable_rewriter=0 + checkpoints_dir = './' + + + # infer args + args = Args() + args.pn='0.90M' + args.fps=16 + args.video_frames=81 + args.model_path=os.path.join(checkpoints_dir, 'infinitystar_8b_720p_weights') + args.checkpoint_type='torch_shard' # omnistore + args.vae_path=os.path.join(checkpoints_dir, 'infinitystar_videovae.pth') + args.text_encoder_ckpt=os.path.join(checkpoints_dir, 'text_encoder/flan-t5-xl-official/') + args.model_type='infinity_qwen8b' + args.text_channels=2048 + args.dynamic_scale_schedule='infinity_elegant_clip20frames_v2' + args.bf16=1 + args.use_apg=1 + args.use_cfg=0 + args.cfg=34 + args.tau_image = 1 + args.tau_video = 0.4 + args.apg_norm_threshold=0.05 + args.image_scale_repetition='[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]' + args.video_scale_repetition='[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 1]' + args.append_duration2caption=1 + args.use_two_stage_lfq=1 + args.detail_scale_min_tokens=750 + args.semantic_scales=12 + args.max_repeat_times=10000 + args.enable_rewriter=enable_rewriter + + # load models + pipe = InferencePipe(args) + + + prompt = "A handsome smiling gardener inspecting plants, realistic cinematic lighting, detailed textures, ultra-realistic" + image_path = 'assets/reference_image.webp' # Remove this for Text-to-Video (T2V) generation + data = { + 'seed': 41, + 'image_path': image_path, + 'prompt': prompt, + } + if args.enable_rewriter: + # Step 1: Rewrite the prompt using GPT + # rewritten_prompt = prompt + rewritten_prompt = pipe.gpt_model( + prompt = ("Rewrite the following video descriptions, add more details of the subject and the camera movement to enhance the quality of the video. Do not use the word 'they' to refer to a single person or object. Concatenate all sentences together, not present them in paragraphs. 
Please rewrite with concise and clear language: " + + prompt), + system_prompt=pipe.system_prompt, + ) + print(f"Rewritten prompt: {rewritten_prompt}") + prompt = rewritten_prompt + data['prompt'] = prompt + + output_dict = perform_inference(pipe, data, args) + save_dir = 'output' + gen_video_path = osp.join(os.path.join(save_dir, 'gen_videos'), f'demo.mp4') + save_video(output_dict['output'], fps=args.fps, save_filepath=gen_video_path) + + print(f"Video genernation done: {gen_video_path=}") diff --git a/Meissonic/InfinityStar/tools/prompt_rewriter.py b/Meissonic/InfinityStar/tools/prompt_rewriter.py new file mode 100644 index 0000000000000000000000000000000000000000..1d10585c95aec5d7625ac742204b3e4aa649b473 --- /dev/null +++ b/Meissonic/InfinityStar/tools/prompt_rewriter.py @@ -0,0 +1,227 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT + +import os +import random +import time +from threading import Lock +from abc import abstractmethod +from typing import Any, List, Union, Optional + +import numpy as np + +import openai + +# from auto_caption.models.base_model import Model +OPEN_API_KEY = os.environ.get('OPEN_API_KEY') +GLOBAL_AZURE_ENDPOINT = os.environ.get('GLOBAL_AZURE_ENDPOINT') + +class SingletonArgMeta(type): + """ + This is a thread-safe implementation of Singleton. + """ + + _instances = {} + + _lock: Lock = Lock() + """ + We now have a lock object that will be used to synchronize threads during + first access to the Singleton. + """ + + def __call__(cls, *args, **kwargs): + """ + changes to the value of the `__init__` argument do affect + the returned instance. + """ + # Now, imagine that the program has just been launched. Since there's no + # Singleton instance yet, multiple threads can simultaneously pass the + # previous conditional and reach this point almost at the same time. The + # first of them will acquire lock and will proceed further, while the + # rest will wait here. + with cls._lock: + # The first thread to acquire the lock, reaches this conditional, + # goes inside and creates the Singleton instance. Once it leaves the + # lock block, a thread that might have been waiting for the lock + # release may then enter this section. But since the Singleton field with + # specific arguments is already initialized, the thread won't create a new object. 
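+ # Illustration of the keyed-singleton behavior implemented below, with a hypothetical class
+ # that is not part of this module: identical constructor arguments return the cached instance,
+ # while different arguments create a fresh one.
+ #   class Foo(metaclass=SingletonArgMeta):
+ #       def __init__(self, name): self.name = name
+ #   assert Foo("a") is Foo("a")      # same key -> cached instance reused
+ #   assert Foo("a") is not Foo("b")  # different args -> new instance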
+ if cls.__name__+str(args)+str(kwargs) not in cls._instances: + instance = super().__call__(*args, **kwargs) + cls._instances[cls.__name__+str(args)+str(kwargs)] = instance + return cls._instances[cls.__name__+str(args)+str(kwargs)] + + +class Model(metaclass=SingletonArgMeta): + """an abstract model""" + + def __init__(self, model_name: Union[str, List[str]], ak: Union[str, List[str]], token_stat_percent: Optional[float] = None) -> None: + self.clients = self._init_clients(model_name, ak) + if token_stat_percent is not None: + self._init_token_stat(token_stat_percent) + + def _init_token_stat(self, token_stat_percent): + self.token_stat_percent = token_stat_percent + self.token_sort = [] + self.token_stat = {'max_token': 0, 'mean_token': 0, + 'count': 0, f'p{token_stat_percent*100}_token_num': 0} + self.token_stat_percent = token_stat_percent + + def _init_clients(self, model_name, ak): + if not isinstance(model_name, list): + model_name = [model_name] + if not isinstance(ak, list): + ak = [ak] + clients = [] + if len(ak) > 1 and len(model_name) == 1: + model_name = model_name*len(ak) + elif len(ak) == 1 and len(model_name) > 1: + ak = ak*len(model_name) + + assert len(ak) == len( + model_name), f"length of ak = {len(ak)} != length of model_name = {len(model_name)}" + for model, ak in zip(model_name, ak): + client = self._creat_client(model, ak) + clients.append(client) + print(f"init {len(clients)} clients!!!") + return clients + + def _update(self, token_num): + self.token_sort.append(token_num) + self.token_stat[f'p{self.token_stat_percent*100}_token_num'] = round( + np.percentile(self.token_sort, self.token_stat_percent*100), 2) + self.token_stat['count'] = len(self.token_sort) + self.token_stat['mean_token'] = round(np.mean(self.token_sort), 2) + self.token_stat['max_token'] = np.max(self.token_sort) + + @abstractmethod + def _creat_client(self, *args: Any, **kwds: Any) -> Any: + raise NotImplementedError + + @abstractmethod + def __call__(self, *args: Any, **kwds: Any) -> Any: + raise NotImplementedError + + +class OpenAIGPTModel(Model): + _global_azure_endpoint = GLOBAL_AZURE_ENDPOINT + _api_version = "2023-05-15" + + def __init__(self, model_name='gpt-4', ak='', log_prob=0.01, if_global=False, token_stat_percent=0.99) -> None: + self.ak = OPEN_API_KEY if not ak else ak + self.if_global = if_global + self.ak_state = {} + self.ak_state_succ = {} + self.log_prob = log_prob + self.start_time = time.time() + super().__init__(model_name, ak, token_stat_percent) + + def _creat_client(self, model_name, ak): + client = openai.AzureOpenAI( + azure_endpoint=OpenAIGPTModel._global_azure_endpoint, + api_version=OpenAIGPTModel._api_version, + api_key=ak, + ) + client.temp_model_name = model_name + client.temp_ak = ak + self.ak_state[ak[:5]] = 0 + self.ak_state_succ[ak[:5]] = 0 + return client + + def __call__(self, prompt="hello", system_prompt=None, max_tokens=1000, return_output_token_length=False): + client = random.choice(self.clients) + ak = client.temp_ak + self.ak_state[ak[:5]] += 1 + if random.random() < self.log_prob: + for ak in self.ak_state: + print( + f"ak: {ak} requests: {self.ak_state[ak]}, successes: {self.ak_state_succ[ak]}, success rate: {self.ak_state_succ[ak]/self.ak_state[ak]*100:.2f}%, throughput: {self.ak_state_succ[ak]/(time.time()-self.start_time)*60:.2f} per minute") + print(f"token_stat:{self.token_stat}")
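+ # The request built below uses the standard chat-completions message format: an optional
+ # system message followed by the user prompt, i.e.
+ #   [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]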
+ + messages = [] + if system_prompt is not None: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + completion = client.chat.completions.create( + extra_headers={"X-TT-LOGID": "lizhe.xyz"}, # be sure to include this header; it makes issues easier to trace + model=client.temp_model_name, + messages=messages, + max_tokens=max_tokens + ) + self.ak_state_succ[ak[:5]] += 1 + if self.token_stat_percent is not None: + # update output-token statistics + self._update(completion.usage.completion_tokens) + + if return_output_token_length: + # optionally return the output token count, which helps filter responses truncated at max_tokens + return completion.choices[0].message.content, completion.usage.completion_tokens + return completion.choices[0].message.content + + +# better prompt +system_prompt = '''You are a prompt engineer, aiming to rewrite user inputs into high-quality prompts for better video generation without affecting the original meaning.\n''' \ +'''Task requirements:\n''' \ +'''1. If no subject appearance description provided in user inputs, add some descriptions about appearance (e.g., clothing, gender);\n''' \ +'''2. If there is no camera movement information in the user inputs and the entire scene is very static, you can appropriately add some simple camera movement descriptions;\n''' \ +'''3. If there is no description of the setting, add some visual details about the scene, but there should not be a change of the scene;\n''' \ +'''4. If there is no description of shot scale in the user's input, you can appropriately add some information about the shot scale;\n''' \ +'''5. If the character's actions take too little time, you can appropriately add some detailed actions during or after the original action so that the whole prompt described can last for 5 seconds in total. The newly added actions need to be dynamic rather than static (for example, looking into the distance or standing still). Add the main subject's actions in chronological order, and include detailed descriptions of their physical movements;\n''' \ +'''6. If you are describing an action, describe the entire process, for example, "pick up the clothes, put your hands into the sleeves, and put on the clothes.";\n''' \ +'''7. The word count for each action description should be evenly distributed throughout the entire output, and there should not be any action descriptions that are significantly longer or shorter than the others;\n''' \ +'''8. The entire sequence of actions needs to be engaging and impressive;\n''' \ +'''9. If there are multiple subjects, describe the actions of each subject;\n''' \ +'''10. All the descriptions must be about appearance or action, and no sensory, feeling or highlighting descriptions are allowed;\n''' \ +'''11. Do not include any emphatic or highlighted descriptions such as demonstrating, showcasing, highlighting, emphasizing anything in the rewritten text;\n''' \ +'''12. All descriptions should be as objective as possible;\n''' \ +'''13. Unless the subject refers to multiple objects, use he/she/it instead of they to refer to the subject;\n''' \ +'''14. In describing actions, do not add adverbs before verbs, do not add prepositional phrase (e.g., with something) after verbs.\n''' \ +'''15. Output the entire prompt in English, retaining original text in quotes and titles, and preserve key input information;\n''' \ +'''16. If the original input is purely a landscape description, do not add additional subjects such as people;\n''' \ +'''17. Who does what action? Clarify the subject at the beginning of the sentence;\n''' \ +'''18. If you describe using the actions "Rolling", "raising hands", "lifting legs", etc., clarify whether they are forward or backward;\n''' \ +'''19. 
If the prompts specify the direction of movement, do not adjust it at will;\n''' \ +'''20. If describing a situation with multiple people, such as a group, state the number of people, but not too many, usually less than five;\n''' \ +'''21. If you are describing a person rotating, if you are describing the head turning, you should also describe the body turning as well;\n''' \ +'''22. When you describe actions, background and camera movement, you need to write them separately, don't mix them together;\n''' \ +'''23. If the original input is related to anime, you need to add "This video showcases an animated scene" at the beginning of your rewritten prompt. If there is a requirement for 2D or 3D animation, you should also include that information;\n''' \ +'''24. The revised prompt should be between 50-140 words long and always use simple and direct words.\n''' \ + +'''Original user inputs and the examples after rewriting:\n''' \ + +'''1. +- **Original user inputs**: A woman is swimming underwater in a pool, extending her arms forward and kicking her legs. The pool has colorful lane dividers in the background. She performs a somersault, rotating her body in the water. After the somersault, she continues swimming forward with a streamlined body position. +- **Examples after rewriting**: The video captures a female swimmer underwater, showing a sequence of movements that demonstrate various swimming techniques. Initially, the swimmer's body is horizontal, with the arms extended forward and the legs straightened, indicating the start of a stroke cycle. As the video progresses, the head turns slightly to the side, the arms begin to bend, and the legs begin to move in a rhythmic dolphin kick, creating a powerful propulsion mechanism. Throughout the frame, the swimmer maintains an streamlined position with minimal movement of the hands and feet to maintain balance and continue forward momentum. The swimmer is wearing a white bathing suit with pink floral patterns, her hair tied back to avoid obstructions. Bubbles can be seen around her body, indicating that she is moving through the water with fluid movements. In the background is a swimming pool lane marked by red and yellow lane lines that extend across the pool. The clear blue water reflects the sunlight, creating a serene and vibrant atmosphere. Another person is visible in the distance, partially obscured by the water's surface, swimming in the same lane. The swimmer's movements create small splashes and bubbles as she moves through the water. The camera follows her in front, highlighting the technique and form of the swimmer. The overall scene conveys a sense of focus and athleticism, highlighting the skill and grace of the swimmer in the water. The camera moves with the female athlete, keeping her in the center of the frame.\n''' \ + +'''2. +- **Original user inputs**: In an indoor tennis court, a man prepares to serve a tennis ball. He tosses the ball into the air and swings his racket to hit it. The opponent moves to the right to intercept the ball. The ball hits the net and falls to the ground on the opponent's side. The man in the white shirt follows through with his serve and then moves to the left side of the court. +- **Examples after rewriting**: The video captures a well-lit indoor tennis court with high ceilings and large windows that allow natural light to filter in. The court has a standard blue surface with white borders and a net in the middle. 
A male player in a white T-shirt and black shorts can be seen tossing a ball upward with his left hand and swinging his racket at the ball with his right hand. The opposing player in a white top and dark shorts then moves to the right of the frame, opens his racket, runs after the ball, and returns a powerful forehand shot that hits the net and lands. The background includes an advertisement for tennis equipment brand Babolat and a bench with practice tennis balls. The overall atmosphere reflects focused training in a professional environment. The camera focuses on the man, moving first to the left and then to the right.\n''' \ + +'''3. +- **Original user inputs**: A man wearing black athletic clothing and bright orange running shoes is running on a paved track. The background features a large, green grassy field with scattered trees and a few buildings in the distance. The sky is clear and blue, indicating a sunny day. The man maintains a steady pace throughout the sequence, with his arms bent at the elbows and his legs moving in a rhythmic motion. +- **Examples after rewriting**: The video captures a male runner in motion in an outdoor setting. The man is shirtless, wearing black running pants and bright orange running shoes. He has a black headband on his head and his hair is closely cropped. He runs on the road, swinging his hands back and forth at his sides as he runs, and his left and right legs alternate between landing and lifting on the road. The man maintains a consistent posture and runs at a constant speed, showing a continuous running posture. The background is a large green lawn that looks like a park or sports field, with trees scattered around the edges. In the distance, buildings that look like houses or apartments can be seen, indicating that the location is close to a residential area. The sky is clear and blue, indicating that it is a sunny day with good weather. The camera follows the man as he moves to the left.\n''' \ + +'''4. +- **Original user inputs**: In a large indoor arena with many people and banners, two individuals dressed in fencing gear are engaged in a fencing match. They are positioned on a white mat with a blue mat nearby. The fencers are seen lunging and parrying at each other with their swords. After a series of movements, they begin to separate and walk towards the right side of the frame. The fencer on the right raises their arm and points towards a man dressed in black standing near a table with a 'naked' banner. +- **Examples after rewriting**: The video captures an intense fencing match between two male fencers in a large indoor arena. The fencers are dressed in traditional white fencing gear, including masks, jackets, gloves, and trousers, with one fencer wearing a light-colored top and light-colored pants, a yellow and black socks and shoes, and a mask with a blue logo on it, while the other fencer has a mask with a red logo on it. They are both holding fencing foils in their hands and positioned on a white mat with a blue mat nearby, with bags and equipment scattered nearby. 
The male fencer on the left side of the frame, wearing a mask with a blue logo on it, leans back with his legs bent, his left hand extended forward holding the sword, and then he stands up straight and continues to attack, while the male fencer on the right side of the frame, wearing a mask with a red logo on it, stands there with his left foot in front and his right foot behind, his right hand extended forward holding the sword, and then both of them stand up straight and continue to attack and defend. The fencer on the right side of the frame actively moves forward with his right foot in front and his left foot behind, while the fencer on the left side of the frame extends his left leg and swings his left hand to attack, and the fencer on the right side of the frame swings his right hand to defend. The fencer on the left side of the frame then retracts his left hand and turns to leave, while the fencer on the right side of the frame turns around and walks away while raising his right hand and looking at the referee dressed in black near the table with the "Naked" banner. The background shows a busy arena with spectators sitting in the stands and other fencers or staff near the barriers. The banners on the fence read "USA Fencing" and "Naked", indicating that this is part of an official event. The high ceiling is supported by structural pillars, and bright lights illuminate the entire space. The atmosphere is focused and competitive, which is typical of competitive fencing events. The camera lens follows the two fencers as they move.\n''' \ + +'''5.
- **Original user inputs**: Players in red and white uniforms are actively engaged in the game, with the player in white number 5 dribbling the ball. The player in white number 5 continues to dribble and moves towards the basket, closely followed by players in red uniforms. The player in white number 5 attempts a shot while being closely guarded by the players in red. The player in white number 5 successfully makes a basket, and the players in red attempt to block the shot. The ball goes through the hoop, scoring a point. The players in red and white continue to move around the court, with the player in white number 5 preparing to make another move.
- **Examples after rewriting**: The video records a lively men's basketball game, which is played in an indoor gymnasium. The gymnasium has high ceilings and exposed metal beams. The court is marked with regulation lines, and the walls are painted white and decorated with colorful banners and a large red banner with yellow Chinese characters. A group of 10 players in red and white jerseys are actively engaged in the game. A player wearing a white No. 5 jersey is moving towards the basket with the ball, while his opponent in a red jersey is in hot pursuit. As the play progresses, the player with the ball fakes first to the right and then to the left, bypassing the defender and shooting towards the basket. The ball arcs through the air and enters the basket, while other players move to grab rebounds or assists. Another player in a red jersey jumps up to try to block the shot. Players on the sidelines watch the game intently, with referees or coaches nearby, some of whom sit at tables holding referee documents. The bright gym lights ensure that the game is clearly visible, highlighting the competitive nature of the sport and the strategic interactions between players. The camera follows the movement of player No. 5.\n''' \ + +'''6.
+- **Original user inputs**: In an indoor badminton court with green walls and a balcony, two men are engaged in a badminton game. The man in black hits the shuttlecock, and the man in white returns it.
- **Examples after rewriting**: The video captures a dynamic game of badminton in an indoor court with a green floor and white boundary lines. The court is surrounded by green walls and has wooden floors and fixed badminton nets. There is a blue banner with white Chinese characters on the left wall and a brand logo on the right. Initially, the male player near the camera, wearing a black shirt and shorts, holds the racket in his right hand behind him and prepares to hit the ball, while the male player wearing a white shirt and dark shorts stands in a ready position on the opposite court. Both players are focused on the game, moving quickly and strategically. The black player hits the shuttlecock with the racket in his right hand extended forward with force, and the white player returns it in the same forceful manner. Following the same pattern, the intensity of the game keeps them fully engaged, as they move across the court to return the shuttlecock and maintain their positions. Towards the end of the video, the shuttlecock appears to have been hit out of the court, indicated by the players' continued movements and gestures. In the background, there are several stationary exercise bikes against the wall and a few other people sitting or standing, perhaps observing or resting. The bright lights in the room highlight the action, emphasizing the speed and precision required for each shot. The video ends with the black player running to the left of the frame and the white player preparing to hit the shuttlecock with the racket in his right hand. The camera perspective does not change. The overall atmosphere reflects a competitive yet friendly sporting environment.\n''' \ + +'''7.
- **Original user inputs**: A man stands on a stone platform by the river. The man releases his grip and tucks his body into a forward roll. The man dives into the river, creating a splash upon entry.
- **Examples after rewriting**: The video captures a man diving into a stone pool from a stone platform. The man is shirtless, wearing black shorts with white stripes and a black hat. Initially, the man stands on one leg on the edge of the platform, stretching his arms upward, ready to dive. He then lowers his arms and prepares to jump, swinging his arms back and forth as he begins to move. The man leaps forward, leaning forward, stretching his arms forward, and his feet alternately downward, completing a somersault before entering the water, with his arms extended upward at the top and his legs extended upward at the bottom. As he enters the water, he creates a splash. The other man, also shirtless and wearing black shorts, stands nearby, observing the diver. The pool is surrounded by stone walls and has a traditional architectural style, with trees and buildings visible in the background. The calm water reflects the sunlight and ripples, highlighting the contrast between stillness and the splash created by the diver's entry. The camera remains stationary, capturing the action as it unfolds against this serene yet dynamic backdrop.\n''' \ + +'''8.
- **Original user inputs**: In an indoor table tennis facility with multiple tables, a man and a woman are engaged in a game of table tennis.
+- **Examples after rewriting**: The video captures an intense game of table tennis in an indoor gymnasium. The venue is spacious, with several tables and chairs neatly arranged in the background. The floor is covered with a red non-woven fabric, and the walls are painted beige and white, with overhead lights evenly illuminating the field. In the video, a man in a yellow shirt and black pants is playing against a woman wearing a white jacket, beige pants and white sneakers. At first, the man in yellow stands on the left side of the screen, leaning forward, holding the racket in his right hand and stretching his arms forward, while the woman in white stands on the right side of the screen, ready to receive the ball. The man in yellow serves, and the woman in white receives the ball. Then the man in yellow and the woman in white both stand up straight and prepare to hit back. The two exchange shots successfully, and the game becomes fierce. The man in yellow hits a powerful ball, but the woman in white does not miss it and then hits a powerful ball back. The man in yellow hits the ball again, and the woman in white jumps up and hits a powerful ball, making the man in yellow raise his arm and give up. The fierce competitive atmosphere is reflected in their focused expressions and agile movements, highlighting the dynamic nature of this sport. The camera angle remains fixed, providing a clear perspective of the action as it unfolds.\n''' \ + +'''9.
- **Original user inputs**: A woman stands on a blue gymnastics mat. She performs a series of spins and jumps, maintaining her balance and poise.
- **Examples after rewriting**: The video captures a female gymnast performing on a blue mat during a competition. The gymnast wears a black leotard with flowing pink patterns, which contrasts sharply with the blue mat. Her hair is tied into two small pigtails, showing her concentration and focus. First, she stretches her arms horizontally and stands in a squatting position, showing her flexibility and control. Then she turns her body while raising one arm and bending the other, showing her agility and grace. She leaps into the air, maintaining a dynamic pose with one leg extended forward and the other backward, before returning to a squatting position and rolling into a front flip in the air, spinning twice to land steadily. She then stands up straight and stretches her arms horizontally to complete her performance with a confident hand gesture and a smile. The venue is an indoor gymnasium, and spectators can be seen sitting in the stands in the background. Judges sit at a table with a banner that reads "OLAY", observing and evaluating the performance. The gymnast runs energetically across the mat, leaps into the air with precision, and transitions into complex movements with great skill. The audience remains focused throughout, highlighting the competitive atmosphere of the event.
The camera follows her movements, capturing her in action.''' \ + diff --git a/Meissonic/InfinityStar/tools/run_infinity.py b/Meissonic/InfinityStar/tools/run_infinity.py new file mode 100644 index 0000000000000000000000000000000000000000..1f93872d30c10d226ae44b0c777f325161844f11 --- /dev/null +++ b/Meissonic/InfinityStar/tools/run_infinity.py @@ -0,0 +1,339 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" +import os.path as osp +from typing import List +import time +import hashlib +import shutil +import re +import json +from typing import Dict + +import cv2 +import numpy as np +import torch +torch._dynamo.config.cache_size_limit=64 +from transformers import AutoTokenizer +from PIL import Image, ImageEnhance +import torch.nn.functional as F +from torch.cuda.amp import autocast +from timm.models import create_model +import imageio + +from infinity.models.infinity import Infinity +from infinity.utils.load import load_visual_tokenizer +from infinity.models.basic import * +import PIL.Image as PImage +from torchvision.transforms.functional import to_tensor +from huggingface_hub import split_torch_state_dict_into_shards +from safetensors.torch import save_file as safe_save_file + + +def split_state_dict(state_dict: Dict[str, torch.Tensor], save_directory: str, max_shard_size='8GB'): + state_dict_split = split_torch_state_dict_into_shards(state_dict, max_shard_size=max_shard_size) + for filename, tensors in state_dict_split.filename_to_tensors.items(): + shard = {tensor: state_dict[tensor] for tensor in tensors} + safe_save_file( + shard, + os.path.join(save_directory, filename), + metadata={"format": "pt"}, + ) + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + with open(os.path.join(save_directory, "model.safetensors.index.json"), "w") as f: + f.write(json.dumps(index, indent=2)) + +def extract_key_val(text): + pattern = r'<(.+?):(.+?)>' + matches = re.findall(pattern, text) + key_val = {} + for match in matches: + key_val[match[0]] = match[1].lstrip() + return key_val + +def encode_prompt(t5_path, text_tokenizer, text_encoder, prompt, enable_positive_prompt=False, low_vram_mode=False): + if enable_positive_prompt: + pass + print(f'prompt={prompt}') + captions = [prompt] + if 'flan-t5' in t5_path: + tokens = text_tokenizer(text=captions, max_length=512, padding='max_length', truncation=True, return_tensors='pt') + input_ids = tokens.input_ids.cuda(non_blocking=True) + mask = tokens.attention_mask.cuda(non_blocking=True) + text_features = text_encoder(input_ids=input_ids, attention_mask=mask)['last_hidden_state'].float() + lens: List[int] = mask.sum(dim=-1).tolist() + cu_seqlens_k = F.pad(mask.sum(dim=-1).to(dtype=torch.int32).cumsum_(0), (1, 0)) + Ltext = max(lens) + kv_compact = [] + for len_i, feat_i in zip(lens, text_features.unbind(0)): + kv_compact.append(feat_i[:len_i]) + kv_compact = torch.cat(kv_compact, dim=0) + text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext) + else: + text_features = text_encoder(captions, 'cuda') + lens = [len(item) for item in text_features] + cu_seqlens_k = [0] + for len_i in lens: + cu_seqlens_k.append(cu_seqlens_k[-1] + len_i) + cu_seqlens_k = torch.tensor(cu_seqlens_k, dtype=torch.int32) + Ltext = max(lens) + kv_compact = torch.cat(text_features, dim=0).float() + text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext) + return text_cond_tuple + +def gen_one_example( + 
infinity_test, + vae, + text_tokenizer, + text_encoder, + prompt, + cfg_list=[], + tau_list=[], + negative_prompt='', + scale_schedule=None, + top_k=900, + top_p=0.97, + cfg_sc=3, + cfg_exp_k=0.0, + cfg_insertion_layer=-5, + vae_type=0, + gumbel=0, + softmax_merge_topk=-1, + gt_leak=-1, + gt_ls_Bl=None, + g_seed=None, + sampling_per_bits=1, + enable_positive_prompt=0, + input_use_interplote_up=False, + low_vram_mode=False, + args=None, + get_visual_rope_embeds=None, + context_info=None, + noise_list=None, + return_summed_code_only=False, + mode='', + former_clip_features=None, + first_frame_features=None, +): + sstt = time.time() + if not isinstance(cfg_list, list): + cfg_list = [cfg_list] * len(scale_schedule) + if not isinstance(tau_list, list): + tau_list = [tau_list] * len(scale_schedule) + text_cond_tuple = encode_prompt(args.text_encoder_ckpt, text_tokenizer, text_encoder, prompt, enable_positive_prompt, low_vram_mode=low_vram_mode) + if negative_prompt: + negative_label_B_or_BLT = encode_prompt(args.text_encoder_ckpt, text_tokenizer, text_encoder, negative_prompt, low_vram_mode=low_vram_mode) + else: + negative_label_B_or_BLT = None + print(f'cfg: {cfg_list}, tau: {tau_list}') + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True): + stt = time.time() + out = infinity_test.autoregressive_infer( + vae=vae, + scale_schedule=scale_schedule, + label_B_or_BLT=text_cond_tuple, g_seed=g_seed, + B=1, negative_label_B_or_BLT=negative_label_B_or_BLT, force_gt_Bhw=None, + cfg_sc=cfg_sc, cfg_list=cfg_list, tau_list=tau_list, top_k=top_k, top_p=top_p, + returns_vemb=1, ratio_Bl1=None, gumbel=gumbel, norm_cfg=False, + cfg_exp_k=cfg_exp_k, cfg_insertion_layer=cfg_insertion_layer, + vae_type=vae_type, softmax_merge_topk=softmax_merge_topk, + ret_img=True, trunk_scale=1000, + gt_leak=gt_leak, gt_ls_Bl=gt_ls_Bl, inference_mode=True, + sampling_per_bits=sampling_per_bits, + input_use_interplote_up=input_use_interplote_up, + low_vram_mode=low_vram_mode, + args=args, + get_visual_rope_embeds=get_visual_rope_embeds, + context_info=context_info, + noise_list=noise_list, + return_summed_code_only=return_summed_code_only, + mode=mode, + former_clip_features=former_clip_features, + first_frame_features=first_frame_features, + ) + if return_summed_code_only: + return out + else: + pred_multi_scale_bit_labels, img_list = out + + print(f"cost: {time.time() - sstt}, infinity cost={time.time() - stt}") + img = img_list[0] + return img, pred_multi_scale_bit_labels + +def get_prompt_id(prompt): + md5 = hashlib.md5() + md5.update(prompt.encode('utf-8')) + prompt_id = md5.hexdigest() + return prompt_id + +def save_slim_model(infinity_model_path, save_file=None, device='cpu', key='gpt_fsdp'): + print('[Save slim model]') + full_ckpt = torch.load(infinity_model_path, map_location=device) + infinity_slim = full_ckpt['trainer'][key] + # ema_state_dict = cpu_d['trainer'].get('gpt_ema_fsdp', state_dict) + if not save_file: + save_file = osp.splitext(infinity_model_path)[0] + '-slim.pth' + print(f'Save to {save_file}') + torch.save(infinity_slim, save_file) + print('[Save slim model] done') + return save_file + +def load_tokenizer(t5_path =''): + print(f'[Loading tokenizer and text encoder]') + if 'flan-t5' in t5_path: + from transformers import AutoTokenizer, T5EncoderModel, T5TokenizerFast + text_tokenizer: T5TokenizerFast = AutoTokenizer.from_pretrained(t5_path, revision=None, legacy=True) + # text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(t5_path, 
torch_dtype=torch.bfloat16) + text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(t5_path, torch_dtype=torch.float16) + text_encoder.to('cuda') + text_encoder.eval() + text_encoder.requires_grad_(False) + else: + raise ValueError(f'Not support t5_path: {t5_path}') + return text_tokenizer, text_encoder + +def transform(pil_img, tgt_h, tgt_w): + width, height = pil_img.size + if width / height <= tgt_w / tgt_h: + resized_width = tgt_w + resized_height = int(tgt_w / (width / height)) + else: + resized_height = tgt_h + resized_width = int((width / height) * tgt_h) + pil_img = pil_img.resize((resized_width, resized_height), resample=PImage.LANCZOS) + # crop the center out + arr = np.array(pil_img) + crop_y = (arr.shape[0] - tgt_h) // 2 + crop_x = (arr.shape[1] - tgt_w) // 2 + im = to_tensor(arr[crop_y: crop_y + tgt_h, crop_x: crop_x + tgt_w]) + return im.add(im).add_(-1) + + +def load_transformer(vae, args): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model_path = args.model_path + + print(f'[Loading Infinity]') + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True), torch.no_grad(): + infinity_test: Infinity = create_model( + args.model_type, + vae_local=vae, text_channels=args.text_channels, text_maxlen=512, + raw_scale_schedule=None, + checkpointing='full-block', + pad_to_multiplier=128, + use_flex_attn=args.use_flex_attn, + add_lvl_embeding_on_first_block=0, + num_of_label_value=args.num_of_label_value, + rope2d_each_sa_layer=args.rope2d_each_sa_layer, + rope2d_normalized_by_hw=args.rope2d_normalized_by_hw, + pn=args.pn, + apply_spatial_patchify=args.apply_spatial_patchify, + inference_mode=True, + train_h_div_w_list=[0.571, 1.0], + video_frames=args.video_frames, + other_args=args, + ).to(device=device) + print(f'[you selected Infinity with {args.model_type}] model size: {sum(p.numel() for p in infinity_test.parameters())/1e9:.2f}B, bf16={args.bf16}') + if args.bf16: + for block in infinity_test.unregistered_blocks: + block.bfloat16() + infinity_test.eval() + infinity_test.requires_grad_(False) + infinity_test.cuda() + torch.cuda.empty_cache() + + if not model_path: + return infinity_test + + print(f'============== [Load Infinity weights] ==============') + if args.checkpoint_type == 'torch': + state_dict = torch.load(model_path, map_location=device) + if 'trainer' in state_dict: + print(infinity_test.load_state_dict(state_dict['trainer']['gpt_fsdp'])) + else: + print(infinity_test.load_state_dict(state_dict)) + elif args.checkpoint_type == 'torch_shard': + from transformers.modeling_utils import load_sharded_checkpoint + print(load_sharded_checkpoint(infinity_test, model_path, strict=False)) + elif args.checkpoint_type == 'omnistore': + from infinity.utils.save_and_load import merge_ckpt + if args.enable_model_cache and osp.exists(args.cache_dir): + local_model_dir = osp.abspath(osp.join(args.cache_dir, 'tmp', model_path.replace('/', '_'))) + else: + local_model_dir = osp.abspath(model_path) + print(f'load checkpoint from {local_model_dir}') + state_dict = merge_ckpt(local_model_dir, osp.join(local_model_dir, 'ouput'), save=False, fsdp_save_flatten_model=args.fsdp_save_flatten_model) + print(infinity_test.load_state_dict(state_dict)) + infinity_test.rng = torch.Generator(device=device) + return infinity_test + +def save_video(ndarray_image_list, fps=24, save_filepath='tmp.mp4'): + if len(ndarray_image_list) == 1: + save_filepath = save_filepath.replace('.mp4', '.jpg') + cv2.imwrite(save_filepath, 
ndarray_image_list[0]) + print(f"Image saved as {osp.abspath(save_filepath)}") + else: + h, w = ndarray_image_list[0].shape[:2] + os.makedirs(osp.dirname(save_filepath), exist_ok=True) + imageio.mimsave(save_filepath, ndarray_image_list[:, :, :, ::-1], fps=fps,) + print(f"Video saved as {osp.abspath(save_filepath)}") + +def read_video_as_frames(video_path): + if video_path.endswith('.jpg'): + return cv2.imread(video_path)[None, ...] + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + print(f"Error: Unable to open video file {video_path}") + return None + frames = [] + frame_count = 0 + while True: + ret, frame = cap.read() + if not ret: + break + frames.append(frame) + frame_count += 1 + cap.release() + frames = np.stack(frames) + return frames + +def add_common_arguments(parser): + parser.add_argument('--cfg', type=str, default='3') + parser.add_argument('--tau', type=float, default=1) + parser.add_argument('--pn', type=str, required=True, choices=['0.06M', '0.25M', '0.40M', '0.90M']) + parser.add_argument('--model_path', type=str, default='') + parser.add_argument('--cfg_insertion_layer', type=int, default=0) + parser.add_argument('--vae_type', type=int, default=64) + parser.add_argument('--vae_path', type=str, default='') + parser.add_argument('--add_lvl_embeding_on_first_block', type=int, default=0, choices=[0,1]) + parser.add_argument('--num_of_label_value', type=int, default=2) + parser.add_argument('--model_type', type=str, default='infinity_2b') + parser.add_argument('--rope2d_each_sa_layer', type=int, default=1, choices=[0,1]) + parser.add_argument('--rope2d_normalized_by_hw', type=int, default=2, choices=[0,1,2]) + parser.add_argument('--use_scale_schedule_embedding', type=int, default=0, choices=[0,1]) + parser.add_argument('--sampling_per_bits', type=int, default=1, choices=[1,2,4,8,16]) + parser.add_argument('--text_encoder_ckpt', type=str, default='') + parser.add_argument('--text_channels', type=int, default=2048) + parser.add_argument('--apply_spatial_patchify', type=int, default=0, choices=[0,1]) + parser.add_argument('--h_div_w_template', type=float, default=1.000) + parser.add_argument('--use_flex_attn', type=int, default=0, choices=[0,1]) + parser.add_argument('--enable_positive_prompt', type=int, default=0, choices=[0,1]) + parser.add_argument('--cache_dir', type=str, default='/dev/shm') + parser.add_argument('--enable_model_cache', type=int, default=0, choices=[0,1]) + parser.add_argument('--checkpoint_type', type=str, default='torch') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--bf16', type=int, default=1, choices=[0,1]) + parser.add_argument('--dynamic_scale_schedule', type=str, default='13_hand_craft') + parser.add_argument('--video_frames', type=int, default=81) + parser.add_argument('--videovae', type=int, default=10) + parser.add_argument('--fake_vae_input', type=int, default=0, choices=[0,1]) + parser.add_argument('--casual_multi_scale', type=int, default=0, choices=[0,1]) + parser.add_argument('--scale_embeds_num', type=int, default=128) + parser.add_argument('--train_h_div_w_list', type=float, default=None, nargs='+') + parser.add_argument('--mask_type', type=str, default='infinity_elegant_clip20frames_v2') + parser.add_argument('--context_frames', type=int, default=1000) diff --git a/Meissonic/InfinityStar/tools/save_dataset_features.py b/Meissonic/InfinityStar/tools/save_dataset_features.py new file mode 100644 index 0000000000000000000000000000000000000000..6a525105981a6424e93f27333f650997d84a598a --- /dev/null 
+++ b/Meissonic/InfinityStar/tools/save_dataset_features.py @@ -0,0 +1,209 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import os +import os.path as osp +import time +import gc +import json +import math +import random +import sys +import argparse +import copy +import traceback +import collections +from collections import deque +from contextlib import nullcontext +from functools import partial +from typing import List, Optional, Tuple +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ['XFORMERS_FORCE_DISABLE_TRITON'] = '1' +import threading + +import numpy as np +import torch +from torch.nn import functional as F +from torch.utils.data import DataLoader +import torch.distributed as tdist +import tqdm + +from tools.run_infinity import * +from infinity.dataset.dataset_joint_vi import JointViIterableDataset +from infinity.schedules.dynamic_resolution import get_dynamic_resolution_meta + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + add_common_arguments(parser) + parser.add_argument('--reweight_loss_by_scale', type=int, default=1, choices=[0,1]) + parser.add_argument('--vis_model_flop_param', type=int, default=0, choices=[0,1]) + parser.add_argument('--image_data_path', type=str, default='') + parser.add_argument('--video_data_path', type=str, default='') + parser.add_argument('--video_batch_size', type=int, default=1) + parser.add_argument('--image_batch_size', type=int, default=1) + parser.add_argument('--dataloader_workers', type=int, default=12) + parser.add_argument('--noise_apply_layers', type=int, default=20) + parser.add_argument('--noise_apply_requant', type=int, default=1, choices=[0,1]) + parser.add_argument('--noise_apply_strength', type=float, default=0.2) + parser.add_argument('--debug_bsc', type=int, default=0, choices=[0,1]) + parser.add_argument('--log_freq', type=int, default=10) + parser.add_argument('--video_fps', type=int, default=24) + parser.add_argument('--steps_per_frame', type=int, default=4) + parser.add_argument('--video_tower_style', type=str, default='bottom') + parser.add_argument('--use_slice', type=int, default=1, choices=[0,1]) + parser.add_argument('--use_vae_token_cache', type=int, default=1, choices=[0,1]) + parser.add_argument('--allow_online_vae_feature_extraction', type=int, default=1, choices=[0,1]) + parser.add_argument('--use_text_token_cache', type=int, default=1, choices=[0,1]) + parser.add_argument('--image_batches_multiply', type=float, default=1) + parser.add_argument('--token_cache_dir', type=str, default='/mnt/bn/genai-data2/hanjian.thu123/vae_features') + parser.add_argument('--down_size_limit', type=int, default=10000) + parser.add_argument('--addition_pn006M', type=int, default=1, choices=[0,1]) + parser.add_argument('--addition_pn025M', type=int, default=1, choices=[0,1]) + parser.add_argument('--video_caption_type', type=str, default='tarsier2_caption') + parser.add_argument('--only_images4extract_feats', type=int, default=1, choices=[0,1]) + parser.add_argument('--temporal_compress_rate', type=int, default=4) + parser.add_argument('--cached_video_frames', type=int, default=81) + parser.add_argument('--duration_resolution', type=float, default=0.005) + parser.add_argument('--train_max_token_len', type=int, default=20480) + parser.add_argument('--cache_check_mode', type=int, default=0) + parser.add_argument('--seq_pack_bucket', type=int, default=1) + parser.add_argument('--drop_long_video', type=int, default=0) + parser.add_argument('--append_duration2caption', type=int, 
default=0) + parser.add_argument('--min_video_frames', type=int, default=32) + parser.add_argument('--addition_pn_list', type=str, default='[]') + parser.add_argument('--semantic_scale_dim', type=int, default=16) + parser.add_argument('--detail_scale_dim', type=int, default=64) + parser.add_argument('--use_learnable_dim_proj', type=int, default=0) + parser.add_argument('--detail_scale_min_tokens', type=int, default=80) + parser.add_argument('--semantic_scales', type=int, default=80) + + parser.add_argument('--tlen', type=int, default=512) + parser.add_argument('--manual_parallel', action="store_true") + parser.add_argument('--num_replicas', type=int, default=-1) # only valid when manual_parallel is True + parser.add_argument('--rank', type=int, default=-1) # only valid when manual_parallel is True + parser.add_argument('--restrict_data_size', type=int, default=-1) + parser.add_argument('--allow_less_one_elem_in_seq', type=int, default=1) + parser.add_argument('--use_feat_proj', type=int, default=2) + parser.add_argument('--use_two_stage_lfq', type=int, default=1) + parser.add_argument('--epoch', type=int, default=0) + + args = parser.parse_args() + + if args.manual_parallel: + device = "cuda:0" + num_replicas = args.num_replicas + rank = args.rank + assert num_replicas > 0 and rank >= 0 + else: + tdist.init_process_group(backend='nccl') + device = torch.device(tdist.get_rank() % torch.cuda.device_count()) + num_replicas = tdist.get_world_size() + rank=tdist.get_rank() + args.device = device + args.text_tokenizer = None + args.duration_resolution = 4 / args.video_fps + + # load vae + vae = load_visual_tokenizer(args, device=device) + + dataset = JointViIterableDataset( + image_meta_folder=args.image_data_path, + video_meta_folder=args.video_data_path, + max_caption_len=512, + short_prob=0.0, + load_vae_instead_of_image=False, + pn=args.pn, + seed=args.seed, + video_fps=args.video_fps, + num_frames=args.video_frames, + online_t5=True, + num_replicas=num_replicas, # 1, + rank=rank, # 0 + dataloader_workers=args.dataloader_workers, + dynamic_resolution_across_gpus=0, + enable_dynamic_length_prompt=0, + dynamic_scale_schedule=args.dynamic_scale_schedule, + add_motion_score2caption=0, + other_args=args, + ) + dataloader = DataLoader(dataset, batch_size=None, num_workers=args.dataloader_workers, pin_memory=True) + print(f'len(dataloader): {len(dataloader)}, len(dataset): {len(dataset)}') + t1 = time.time() + dataloader.dataset.set_epoch(0) + pbar = tqdm.tqdm(total=len(dataloader)) + accumulate_res = collections.defaultdict(list) + dynamic_resolution_h_w, h_div_w_templates = get_dynamic_resolution_meta(args.dynamic_scale_schedule, args.video_frames) + + print(device) + vae.to(device) + + def save_token(): + while True: + try: + raw_features, feature_cache_files4images = save_token_queue.get() + for i in range(len(feature_cache_files4images)): + if not osp.exists(feature_cache_files4images[i]): + os.makedirs(osp.dirname(feature_cache_files4images[i]), exist_ok=True) + torch.save(raw_features[i], feature_cache_files4images[i]) + print(f'Save to {feature_cache_files4images[i]}') + else: + print(f'{feature_cache_files4images[i]} exists, skip') + except Exception as e: + print(f"Error saving token: {e}") + finally: + save_token_queue.task_done() + + import queue + save_token_queue = queue.Queue() + saver = threading.Thread(target=save_token, daemon=True) + saver.start() + + data_time = time.time() + iter_time = time.time() + + pn_list = [args.pn] + json.loads(args.addition_pn_list) + pn_list = 
list(set(pn_list)) + + for i, data in enumerate(iter(dataloader)): + pbar.update(1) + # print(f"[step {i}]: iter time: {time.time() - iter_time:.2f}s, data time:{time.time() - data_time:.2f}s") + iter_time = time.time() + # print("data time", iter_time - data_time) + + captions, feature_cache_files4images, raw_features_bcthw = data['captions'], data['feature_cache_files4images'], data['raw_features_bcthw'] + # print(len(feature_cache_files4images)) + if args.only_images4extract_feats: + assert len(raw_features_bcthw) == 0 + if not len(feature_cache_files4images): + continue + + for pn_ind, pn in enumerate(pn_list): + if pn == args.pn: + inp_B3HW = data['images'] + else: + inp_B3HW = data['addition_pn_images'][f'img_T3HW_{pn}'] + try: + # print(f"args.pn:{args.pn}, pn:{pn}") + cur_feature_cache_files4images = [item.replace(f'pn_{args.pn}', f'pn_{pn}') for item in feature_cache_files4images] + except Exception as e: + import pdb; pdb.set_trace() + assert len(inp_B3HW) == len(cur_feature_cache_files4images) + + for images_CTHW, feature_save_file in zip(inp_B3HW, cur_feature_cache_files4images): + try: + pt = images_CTHW.shape[-3] + h_div_w = images_CTHW.shape[-2] / images_CTHW.shape[-1] + h_div_w_templates = np.array(list(dynamic_resolution_h_w.keys())) + h_div_w_template = h_div_w_templates[np.argmin(np.abs(h_div_w-h_div_w_templates))] + # [forward] + with torch.amp.autocast('cuda', enabled=False): + with torch.no_grad(): + raw_features, _, _ = vae.encode_for_raw_features(images_CTHW.unsqueeze(0).to(device), scale_schedule=None, slice=args.use_slice) + raw_features = raw_features.cpu().data + save_token_queue.put((raw_features, [feature_save_file])) + except Exception as e: + print(e) + data_time = time.time() + # print("iter time", data_time - iter_time) + save_token_queue.join() diff --git a/Meissonic/InfinityStar/train.py b/Meissonic/InfinityStar/train.py new file mode 100644 index 0000000000000000000000000000000000000000..8d7b131b82ae6ee4054045a5c6a34dcdadf1f76f --- /dev/null +++ b/Meissonic/InfinityStar/train.py @@ -0,0 +1,466 @@ +# Copyright (c) 2025 FoundationVision +# SPDX-License-Identifier: MIT +import gc +import json +import math +import os +import os.path as osp +import random +import sys +import time +import traceback +from collections import deque +from contextlib import nullcontext +from functools import partial +from distutils.util import strtobool +from typing import List, Optional, Tuple +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ['XFORMERS_FORCE_DISABLE_TRITON'] = '1' +# os.environ["TORCH_LOGS"] = "+dynamo" +# os.environ["TORCHDYNAMO_VERBOSE"] = '1' + +import numpy as np +import torch +torch._dynamo.config.cache_size_limit = 64 +from torch.nn import functional as F +from torch.profiler import record_function +from torch.utils.data import DataLoader +from transformers import AutoTokenizer, T5EncoderModel, T5TokenizerFast +import torch.distributed as tdist + +import infinity.utils.dist as dist +from infinity.dataset.build import build_joint_dataset +from infinity.utils.save_and_load import CKPTSaver, omnistoreCheckpoint, auto_resume, omnistore_auto_resume +from infinity.models.ema import get_ema_model +from infinity.utils import arg_util, misc, wandb_utils +from infinity.trainer import get_trainer +# from infinity.utils.mfu.mfu import mfutool + +def build_everything_from_args(args: arg_util.Args, saver): + # set seed + args.set_initial_seed(benchmark=True) + # build tokenizer + print(f'Loading T5 from {args.t5_path}...') + if 'flan-t5' in args.t5_path: + from 
transformers import T5EncoderModel, T5TokenizerFast + text_tokenizer: T5TokenizerFast = AutoTokenizer.from_pretrained(args.t5_path, revision=None, legacy=True) # text_tokenizer.model_max_length is 512 + text_tokenizer.model_max_length = args.tlen + text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(args.t5_path, torch_dtype=torch.float16) + text_encoder.to(args.device) + text_encoder.eval() + text_encoder.requires_grad_(False) + args.text_tokenizer_type = 'flan_t5' + args.text_tokenizer = text_tokenizer + else: # umt5 + raise ValueError("Only flan-t5 is supported now.") + + # build models. Note that here gpt is the causal VAR transformer which performs next scale prediciton with text guidance + vae_local, gpt_uncompiled, gpt_wo_ddp, gpt_ddp, gpt_wo_ddp_ema, gpt_ddp_ema, gpt_optim = build_model_optimizer(args) + + # IMPORTANT: import heavy package `InfinityTrainer` after the Dataloader object creation/iteration to avoid OOM + InfinityTrainer = get_trainer(args) + # build trainer + trainer = InfinityTrainer( + device=args.device, + raw_scale_schedule=args.scale_schedule, + vae_local=vae_local, + gpt_wo_ddp=gpt_wo_ddp, gpt=gpt_ddp, + gpt_opt=gpt_optim, + label_smooth=args.label_smooth, + zero=args.zero, + vae_type=args.vae_type, + reweight_loss_by_scale=args.reweight_loss_by_scale, + gpt_wo_ddp_ema=gpt_wo_ddp_ema, + gpt_ema=gpt_ddp_ema, + use_fsdp_model_ema=args.use_fsdp_model_ema, + other_args=args, + ) + + # auto resume from broken experiment + global_it = 0 + if args.checkpoint_type == 'torch': + auto_resume_info, start_ep, global_it, acc_str, _, trainer_state, _ = auto_resume(args, 'global_step_*') + if trainer_state is not None and len(trainer_state): + trainer.load_state_dict(trainer_state, strict=False, skip_vae=True) + elif args.checkpoint_type == 'omnistore': + resume_path, info = omnistore_auto_resume(args, 'global_step_*') + if not resume_path and args.rush_omnistore_resume: + resume_path = args.rush_omnistore_resume + if resume_path: + print(f"omnistore resume from {resume_path}", flush=True) + args_state, start_ep, start_it, global_it, acc_str, eval_milestone = saver.load(resume_path, fsdp_object=trainer.gpt, optimizer_object=trainer.gpt_opt.optimizer) + dist.barrier() + if args.rush_omnistore_resume == resume_path: + global_it = 0 + auto_resume_info, acc_str, eval_milestone, trainer_state, args_state = info, '[no acc str]', [], {}, {} + + del vae_local, gpt_uncompiled, gpt_wo_ddp, gpt_ddp, gpt_wo_ddp_ema, gpt_ddp_ema, gpt_optim + dist.barrier() + return text_tokenizer, text_encoder, trainer, global_it + + +def build_model_optimizer(args): + from torch.nn.parallel import DistributedDataParallel as DDP + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from infinity.models.infinity import Infinity, MultipleLayers + from infinity.models.init_param import init_weights + from infinity.utils.amp_opt import AmpOptimizer + from infinity.utils.lr_control import filter_params + from infinity.utils.load import build_vae_gpt + + # disable builtin initialization for speed + setattr(torch.nn.Linear, 'reset_parameters', lambda self: None) + setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None) + vae_local, gpt_wo_ddp = build_vae_gpt(args, device=args.model_init_device) + count_p = lambda m: sum(p.numel() for p in m.parameters()) / 1e6 + num_para = count_p(gpt_wo_ddp) + if num_para/1000 < 20: # < 20B + gpt_wo_ddp = gpt_wo_ddp.to('cuda') + + if args.tini < 0: + args.tini = math.sqrt(1 / gpt_wo_ddp.C / 3) + init_weights(gpt_wo_ddp, other_std=args.tini) 
+ gpt_wo_ddp.special_init() + if args.use_fsdp_model_ema: + gpt_wo_ddp_ema = get_ema_model(gpt_wo_ddp) + else: + gpt_wo_ddp_ema = None + + if args.rush_resume: + print(f"{args.rush_resume=}") + cpu_d = torch.load(args.rush_resume, 'cpu') + if 'trainer' in cpu_d: + state_dict = cpu_d['trainer']['gpt_fsdp'] + ema_state_dict = cpu_d['trainer'].get('gpt_ema_fsdp', state_dict) + else: + state_dict = cpu_d + ema_state_dict = state_dict + def drop_unfit_weights(state_dict): + if 'word_embed.weight' in state_dict and (state_dict['word_embed.weight'].shape[1] != gpt_wo_ddp.word_embed.in_features): + print(f'[rush_resume] drop word_embed.weight') + del state_dict['word_embed.weight'] + if 'head.weight' in state_dict and (state_dict['head.weight'].shape[0] != gpt_wo_ddp.head.out_features): + print(f'[rush_resume] drop head.weight') + del state_dict['head.weight'] + if 'head.bias' in state_dict and (state_dict['head.bias'].shape[0] != gpt_wo_ddp.head.bias.shape[0]): + print(f'[rush_resume] drop head.bias') + del state_dict['head.bias'] + if 'text_proj_for_sos.ca.mat_kv.weight' in state_dict and \ + (state_dict['text_proj_for_sos.ca.mat_kv.weight'].shape != gpt_wo_ddp.text_proj_for_sos.ca.mat_kv.weight.shape): + print(f'[rush_resume] drop cfg_uncond') + del state_dict['cfg_uncond'] + for key in list(state_dict.keys()): + if 'text' in key: + del state_dict[key] + if 'semantic_head.weight' in state_dict: + print(f'[rush_resume] replace semantic_head with semantic_head2') + state_dict['semantic_head2.weight'] = state_dict['semantic_head.weight'] + state_dict['semantic_head2.bias'] = state_dict['semantic_head.bias'] + del state_dict['semantic_head.weight'] + del state_dict['semantic_head.bias'] + if 'semantic_head2.weight' in state_dict and (state_dict['semantic_head2.weight'].shape[0] != gpt_wo_ddp.semantic_head2.out_features): + print(f'[rush_resume] drop semantic_head2.weight, semantic_head2.bias') + del state_dict['semantic_head2.weight'] + del state_dict['semantic_head2.bias'] + return state_dict + print(gpt_wo_ddp.load_state_dict(drop_unfit_weights(state_dict), strict=False)) + if args.use_fsdp_model_ema: + gpt_wo_ddp_ema.load_state_dict(drop_unfit_weights(ema_state_dict), strict=False) + elif args.torchshard_resume: + from transformers.modeling_utils import load_sharded_checkpoint + load_sharded_checkpoint(gpt_wo_ddp, args.torchshard_resume, strict=False) + + ndim_dict = {name: para.ndim for name, para in gpt_wo_ddp.named_parameters() if para.requires_grad} + + print(f'[PT] GPT model = {gpt_wo_ddp}\n\n') + print(f'[PT][#para], GPT={num_para:.2f}\n\n') + + gpt_uncompiled = gpt_wo_ddp + + gpt_ddp_ema = None + if args.zero: + from torch.distributed.fsdp import ShardingStrategy + from torch.distributed.fsdp.wrap import ModuleWrapPolicy + from torch.distributed.device_mesh import init_device_mesh + + # use mix prec: https://github.com/pytorch/pytorch/issues/76607 + if gpt_wo_ddp.num_block_chunks == 1: # no chunks + auto_wrap_policy = ModuleWrapPolicy([type(gpt_wo_ddp.unregistered_blocks[0]), ]) + else: + auto_wrap_policy = ModuleWrapPolicy([MultipleLayers, ]) + + if args.enable_hybrid_shard: + sharding_strategy = ShardingStrategy.HYBRID_SHARD if args.zero == 3 else ShardingStrategy._HYBRID_SHARD_ZERO2 + world_size = dist.get_world_size() + assert world_size % args.inner_shard_degree == 0 + assert args.inner_shard_degree > 1 and args.inner_shard_degree < world_size + device_mesh = init_device_mesh('cuda', (world_size // args.inner_shard_degree, args.inner_shard_degree)) + else: + sharding_strategy = 
ShardingStrategy.FULL_SHARD if args.zero == 3 else ShardingStrategy.SHARD_GRAD_OP + device_mesh = None + print(f'{">" * 45 + " " * 5} FSDP INIT with {args.zero=} {sharding_strategy=} {auto_wrap_policy=} {" " * 5 + "<" * 45}', flush=True) + + if args.fsdp_init_device == 'cpu': + gpt_wo_ddp = gpt_wo_ddp.cpu() + + gpt_ddp: FSDP = FSDP( + gpt_wo_ddp, + device_id=dist.get_local_rank(), + sharding_strategy=sharding_strategy, + mixed_precision=None, + auto_wrap_policy=auto_wrap_policy, + use_orig_params=True, + sync_module_states=True, + limit_all_gathers=True, + device_mesh=device_mesh, + ).to(args.device) + + if args.use_fsdp_model_ema: + gpt_wo_ddp_ema = gpt_wo_ddp_ema.to(args.device) + gpt_ddp_ema: FSDP = FSDP( + gpt_wo_ddp_ema, + device_id=dist.get_local_rank(), + sharding_strategy=sharding_strategy, + mixed_precision=None, + auto_wrap_policy=auto_wrap_policy, + use_orig_params=args.fsdp_orig, + sync_module_states=True, + limit_all_gathers=True, + ) + else: + ddp_class = DDP if dist.initialized() else misc.NullDDP + gpt_ddp: DDP = ddp_class(gpt_wo_ddp, device_ids=[dist.get_local_rank()], find_unused_parameters=False, broadcast_buffers=False) + torch.cuda.synchronize() + + # =============== build optimizer =============== + nowd_keys = set() + if args.disable_weight_decay: + nowd_keys |= { + 'cls_token', 'start_token', 'task_token', 'cfg_uncond', + 'pos_embed', 'pos_1LC', 'pos_start', 'start_pos', 'lvl_embed', + 'gamma', 'beta', + 'ada_gss', 'moe_bias', + 'scale_mul', + 'text_proj_for_sos.ca.mat_q', + } + names, paras, para_groups = filter_params(gpt_ddp if args.zero else gpt_wo_ddp, ndim_dict, nowd_keys=nowd_keys) + del ndim_dict + if '_' in args.ada: + beta0, beta1 = map(float, args.ada.split('_')) + else: + beta0, beta1 = float(args.ada), -1 + + opt_clz = { + 'sgd': partial(torch.optim.SGD, momentum=beta0, nesterov=True), + 'adam': partial(torch.optim.AdamW, betas=(beta0, beta1), fused=args.fused_adam), + 'adamw': partial(torch.optim.AdamW, betas=(beta0, beta1), fused=args.fused_adam), + }[args.opt] + opt_kw = dict(lr=args.tlr, weight_decay=0) + if args.adam_eps: opt_kw['eps'] = args.adam_eps + print(f'[vgpt] optim={opt_clz}, opt_kw={opt_kw}\n') + gpt_optim = AmpOptimizer('gpt', args.fp16, opt_clz(params=para_groups, **opt_kw), gpt_ddp if args.zero else gpt_wo_ddp, args.r_accu, args.grad_clip, args.zero) + del names, paras, para_groups + return vae_local, gpt_uncompiled, gpt_wo_ddp, gpt_ddp, gpt_wo_ddp_ema, gpt_ddp_ema, gpt_optim + + +def build_dataset(args): + train_dataset = build_joint_dataset( + args, + args.data_path, + args.video_data_path, + max_caption_len=args.tlen, + short_prob=args.short_cap_prob, + load_vae_instead_of_image=False + ) + return train_dataset + +def main_train(args: arg_util.Args): + if args.checkpoint_type == 'torch': + saver = CKPTSaver(dist.is_master(), eval_milestone=None) + elif args.checkpoint_type == 'omnistore': + saver = omnistoreCheckpoint(eval_milestone=None) + else: + raise ValueError(f'{args.checkpoint_type=}') + ret = build_everything_from_args(args, saver) + + if ret is None: + return + + text_tokenizer, text_encoder, trainer, start_global_it = ret + gc.collect(), torch.cuda.empty_cache() + seg5 = np.linspace(1, args.epoch, 5+1, dtype=int).tolist() + + time.sleep(3), gc.collect(), torch.cuda.empty_cache(), time.sleep(3) + ep_lg = max(1, args.epoch // 10) if args.epoch <= 100 else max(1, args.epoch // 20) + + # ============================================= epoch loop begins ============================================= + # build wandb logger + if 
dist.is_master(): + wandb_utils.wandb.init(project=args.project_name, name=args.exp_name, config={}) + for ep in range(args.epoch): + # build data at each epoch to ensure read meta take effects for each dataloader worker + args.epoch = ep + + if ep == 0: + train_dataset = build_dataset(args) + iters_train = len(train_dataset) + start_ep = start_global_it // iters_train + start_it = start_global_it % iters_train + print(f'[PT info] from ep{start_ep} it{start_it} {iters_train=}=======> bed: {args.bed} <=======\n') + + if ep < start_ep: + continue + if ep > start_ep: + train_dataset = build_dataset(args) + iters_train = len(train_dataset) + + # [train one epoch] + train_dataloader = DataLoader(dataset=train_dataset, num_workers=args.workers, pin_memory=True, batch_size=None) + stats = train_one_epoch( + epoch=ep, + is_first_ep=ep == start_ep, + start_it=start_it if ep == start_ep else 0, + start_global_it=start_global_it, + me=None, + saver=saver, + args=args, + dataloader_iter=iter(train_dataloader), + iters_train=iters_train, + text_tokenizer=text_tokenizer, text_encoder=text_encoder, + trainer=trainer, + ) + + del stats, train_dataset, train_dataloader + return + + +g_speed_ls = deque(maxlen=128) +def train_one_epoch( + epoch: int, is_first_ep: bool, start_it: int, start_global_it: int, me: misc.MetricLogger, + saver: CKPTSaver, args: arg_util.Args, dataloader_iter, iters_train: int, + text_tokenizer: T5TokenizerFast, text_encoder: T5EncoderModel, trainer, +): + # IMPORTANT: import heavy packages after the Dataloader object creation/iteration to avoid OOM + step_cnt = 0 + header = f'[Ep]: [{epoch:4d}/{args.epoch}]' + + last_touch = time.time() + g_it, max_it = epoch * iters_train, args.epoch * iters_train + + doing_profiling = args.prof and epoch == 0 and (args.profall or dist.is_master()) + maybe_record_function = record_function if doing_profiling else nullcontext + trainer.gpt_wo_ddp.maybe_record_function = maybe_record_function + + last_t_perf = time.time() + speed_ls: deque = g_speed_ls + FREQ = min(args.prof_freq, iters_train//2-1) + NVIDIA_IT_PLUS_1 = set(FREQ*i for i in (1, 2, 3, 4, 6, 8)) + ranges = set([2 ** i for i in range(20)]) + if epoch <= 1: ranges |= {1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 32, 40} + PRINTABLE_IT_PLUS_1 = set(FREQ*i for i in ranges) + + me = misc.MetricLogger() + [me.add_meter(x, misc.SmoothedValue(window_size=1, fmt='{value:.2g}')) for x in ['tlr']] + [me.add_meter(x, misc.SmoothedValue(window_size=1, fmt='{median:.2f} ({global_avg:.2f})')) for x in ['tnm']] + [me.add_meter(x, misc.SmoothedValue(window_size=1, fmt='{median:.3f} ({global_avg:.3f})')) for x in ['L', 'L_i', 'L_v']] + [me.add_meter(x, misc.SmoothedValue(window_size=1, fmt='{median:.2f} ({global_avg:.2f})')) for x in ['Acc', 'Acc_i', 'Acc_v']] + [me.add_meter(x, misc.SmoothedValue(window_size=1, fmt='{median:.2f} ({global_avg:.2f})')) for x in ['seq_usage']] + # ============================================= iteration loop begins ============================================= + for it, data in me.log_every(start_it, iters_train, dataloader_iter, args.log_freq, args.log_every_iter, header, args): + g_it = epoch * iters_train + it + # mfutool.step() + # mfu_val = mfutool.get_mfu() * 100 # to percent + # print(f"[MFU] step={g_it}, mfu={mfu_val:.2f} %, mfu.iter_time = {mfutool.iter_time():.4f} s") + + + if (it+1) % FREQ == 0: + speed_ls.append((time.time() - last_t_perf) / FREQ) + last_t_perf = time.time() + + if (g_it+1) % args.save_model_iters_freq == 0: + if args.checkpoint_type == 'torch': + 
saver.sav(args=args, g_it=(g_it+1), next_ep=epoch, next_it=it+1, trainer=trainer, acc_str=f'[todo]', eval_milestone=None, also_save_to=None, best_save_to=None) + elif args.checkpoint_type == 'omnistore': + saver.sav(args=args, global_it=(g_it+1), next_ep=epoch, next_it=it+1, fsdp_object=trainer.gpt, optimizer_object=trainer.gpt_opt.optimizer, acc_str=None, eval_milestone=None) + + with maybe_record_function('before_train'): + # [get data] + images, captions, raw_features_bcthw, feature_cache_files4images, media = data['images'], data['captions'], data['raw_features_bcthw'], data['feature_cache_files4images'], data['media'] + + # # [prepare text features] + if args.text_tokenizer_type == 'flan_t5': + tokens = text_tokenizer(text=captions, max_length=text_tokenizer.model_max_length, padding='max_length', truncation=True, return_tensors='pt') # todo: put this into dataset + input_ids = tokens.input_ids.cuda(non_blocking=True) + mask = tokens.attention_mask.cuda(non_blocking=True) + text_features = text_encoder(input_ids=input_ids, attention_mask=mask)['last_hidden_state'].float() + lens: List[int] = mask.sum(dim=-1).tolist() + cu_seqlens_k = F.pad(mask.sum(dim=-1).to(dtype=torch.int32).cumsum_(0), (1, 0)) + Ltext = max(lens) + kv_compact = [] + for text_ind, (len_i, feat_i) in enumerate(zip(lens, text_features.unbind(0))): + kv_compact.append(feat_i[:len_i]) + kv_compact = torch.cat(kv_compact, dim=0) + text_cond_tuple: Tuple[torch.FloatTensor, List[int], torch.LongTensor, int] = (kv_compact, lens, cu_seqlens_k, Ltext) + else: + text_features = text_encoder(captions, args.device) + lens = [len(item) for item in text_features] + cu_seqlens_k = [0] + for len_i in lens: + cu_seqlens_k.append(cu_seqlens_k[-1] + len_i) + cu_seqlens_k = torch.tensor(cu_seqlens_k, dtype=torch.int32) + Ltext = max(lens) + kv_compact = torch.cat(text_features, dim=0).float() + text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext) + + if len(images): + images = [item.to(args.device, non_blocking=True) for item in images] + if len(raw_features_bcthw): + raw_features_bcthw = [item.to(args.device, non_blocking=True) for item in raw_features_bcthw] + + # [logging] + if dist.is_local_master() and (it >= start_it + 10) and (time.time() - last_touch > 90): + args.dump_log() + last_touch = time.time() + + # [get scheduled hyperparameters] + progress = g_it / (max_it - 1) + clip_decay_ratio = (0.3 ** (20 * progress) + 0.2) if args.cdec else 1 + + stepping = (g_it + 1) % args.ac == 0 + step_cnt += int(stepping) + + with maybe_record_function('in_training'): + grad_norm_t, scale_log2_t = trainer.train_step( + epoch=epoch, + it=it, + g_it=g_it, + stepping=stepping, + clip_decay_ratio=clip_decay_ratio, + metric_lg=me, + inp_B3HW=images, + raw_features_bcthw=raw_features_bcthw, + feature_cache_files4images=feature_cache_files4images, + text_cond_tuple=text_cond_tuple, + media=media, + args=args, + ) + + with maybe_record_function('after_train'): + me.update(tlr=args.tlr) + # ============================================= iteration loop ends ============================================= + + me.synchronize_between_processes() + return {k: meter.global_avg for k, meter in me.meters.items()} + + +def main(): + args: arg_util.Args = arg_util.init_dist_and_get_args() + main_train(args) + print(f'final args:\n\n{str(args)}') + args.dump_log() + if isinstance(sys.stdout, dist.BackupStreamToFile) and isinstance(sys.stderr, dist.BackupStreamToFile): + sys.stdout.close(), sys.stderr.close() + dist.barrier() + +if __name__ == '__main__': + 
main() diff --git a/Meissonic/InfinityStar/vae_reconstruction_test/comparison.mp4 b/Meissonic/InfinityStar/vae_reconstruction_test/comparison.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2f8af955e72b0c0884efc16bc69d83f98740cf42 --- /dev/null +++ b/Meissonic/InfinityStar/vae_reconstruction_test/comparison.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c179a456120edfa8df2d54bf438270503f3210101efbc0acf096ef32e6a85e26 +size 596649 diff --git a/Meissonic/InfinityStar/vae_reconstruction_test/comparison_grid.png b/Meissonic/InfinityStar/vae_reconstruction_test/comparison_grid.png new file mode 100644 index 0000000000000000000000000000000000000000..965824044f75d3638f6efaaad0cfcf816f06a826 --- /dev/null +++ b/Meissonic/InfinityStar/vae_reconstruction_test/comparison_grid.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d026c6309def71e22a232ddc202461dc2276a3022880cbfc45c9b9090b9871bb +size 6041734 diff --git a/Meissonic/InfinityStar/vae_reconstruction_test/frame_000_comparison.png b/Meissonic/InfinityStar/vae_reconstruction_test/frame_000_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..ac667fe706da3f88167748b9bccc97ac9e2bc117 --- /dev/null +++ b/Meissonic/InfinityStar/vae_reconstruction_test/frame_000_comparison.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:290d7c3bf99413b965e42022bedc6eb3115486ea7bd7c21015e8f93e7fbbee61 +size 935646 diff --git a/Meissonic/InfinityStar/vae_reconstruction_test/frame_001_comparison.png b/Meissonic/InfinityStar/vae_reconstruction_test/frame_001_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..ed808da5610da2d6875ab686cd08beacea8f84e5 --- /dev/null +++ b/Meissonic/InfinityStar/vae_reconstruction_test/frame_001_comparison.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:741e12e178a1b026e39c451ccddcbe0f55a95eb9e66287628e5d04fd77d75b0c +size 968011 diff --git a/Meissonic/InfinityStar/vae_reconstruction_test/frame_002_comparison.png b/Meissonic/InfinityStar/vae_reconstruction_test/frame_002_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..cb70027393b03421cf502cd1408de7eb5fe397bc --- /dev/null +++ b/Meissonic/InfinityStar/vae_reconstruction_test/frame_002_comparison.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd72eef725688bd6a113c118e4dff27c060d3f5dfc25cd5041033f32ee04b03 +size 979803 diff --git a/Meissonic/InfinityStar/vae_reconstruction_test/frame_003_comparison.png b/Meissonic/InfinityStar/vae_reconstruction_test/frame_003_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..01788f8439bc410452a431db4626a7d3a7e6cf92 --- /dev/null +++ b/Meissonic/InfinityStar/vae_reconstruction_test/frame_003_comparison.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc239ef47a83a0e511ee22bd704309f2e26a93ac1c4711875ae582504888df2e +size 984296 diff --git a/Meissonic/InfinityStar/vae_reconstruction_test/frame_004_comparison.png b/Meissonic/InfinityStar/vae_reconstruction_test/frame_004_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..95e67f1d240b781ed079e0e585d543c91c836df1 --- /dev/null +++ b/Meissonic/InfinityStar/vae_reconstruction_test/frame_004_comparison.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:808e9e50cf6ebc2b2c8c0ddf4381bd94038a29c6b93ff210d396c24cab97a843 +size 986298 diff --git 
a/Meissonic/LICENSE b/Meissonic/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/Meissonic/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Meissonic/README.md b/Meissonic/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f9e69eaa3761b7a62ae13df9d2fa61a9814e316 --- /dev/null +++ b/Meissonic/README.md @@ -0,0 +1,222 @@ +# Meissonic: Revitalizing Masked Generative Transformers for Efficient High-Resolution Text-to-Image Synthesis + +
+Meissonic Banner + +[![arXiv](https://img.shields.io/badge/arXiv-2410.08261-b31b1b.svg)](https://arxiv.org/abs/2410.08261) +[![Hugging Face](https://img.shields.io/badge/🤗%20Huggingface-Model_Meissonic-yellow)](https://huggingface.co/MeissonFlow/Meissonic) +[![GitHub](https://img.shields.io/badge/GitHub-Repo-181717?logo=github)](https://github.com/viiika/Meissonic) +[![YouTube](https://img.shields.io/badge/YouTube-Tutorial_EN-FF0000?logo=youtube)](https://www.youtube.com/watch?v=PlmifElhr6M) +[![YouTube](https://img.shields.io/badge/YouTube-Tutorial_JA-FF0000?logo=youtube)](https://www.youtube.com/watch?v=rJDrf49wF64) +[![Demo](https://img.shields.io/badge/Live-Demo_Meissonic-blue?logo=huggingface)](https://huggingface.co/spaces/MeissonFlow/meissonic) +[![Replicate](https://replicate.com/chenxwh/meissonic/badge)](https://replicate.com/chenxwh/meissonic) + +[![Hugging Face](https://img.shields.io/badge/🤗%20Huggingface-Model_Monetico-yellow)](https://huggingface.co/Collov-Labs/Monetico) +[![Demo](https://img.shields.io/badge/Live-Demo_Monetico-blue?logo=huggingface)](https://huggingface.co/spaces/Collov-Labs/Monetico) + +[![arXiv](https://img.shields.io/badge/arXiv-2411.10781-b31b1b.svg)](https://arxiv.org/abs/2411.10781) + +[![arXiv](https://img.shields.io/badge/arXiv-2503.15457-b31b1b.svg)](https://arxiv.org/abs/2503.15457) +[![Hugging Face](https://img.shields.io/badge/🤗%20Huggingface-Model_DiMO-yellow)](https://huggingface.co/Yuanzhi/DiMO) +[![GitHub](https://img.shields.io/badge/GitHub-Repo-181717?logo=github)](https://github.com/yuanzhi-zhu/DiMO) + + +[![arXiv](https://img.shields.io/badge/arXiv-2505.23606-b31b1b.svg)](https://arxiv.org/abs/2505.23606) +[![Hugging Face](https://img.shields.io/badge/🤗%20Huggingface-Model_Muddit-yellow)](https://huggingface.co/MeissonFlow/Muddit) +[![GitHub](https://img.shields.io/badge/GitHub-Repo-181717?logo=github)](https://github.com/M-E-AGI-Lab/Muddit) +[![Demo](https://img.shields.io/badge/Live-Demo_Muddit-blue?logo=huggingface)](https://huggingface.co/spaces/MeissonFlow/muddit) + +[![arXiv](https://img.shields.io/badge/arXiv-2507.04947-b31b1b.svg)](https://arxiv.org/abs/2507.04947) + +[![arXiv](https://img.shields.io/badge/arXiv-2508.10684-b31b1b.svg)](https://arxiv.org/abs/2508.10684) + +[![arXiv](https://img.shields.io/badge/arXiv-2509.19244-b31b1b.svg)](https://arxiv.org/abs/2509.19244) +[![arXiv](https://img.shields.io/badge/arXiv-2509.23919-b31b1b.svg)](https://arxiv.org/abs/2509.23919) +[![arXiv](https://img.shields.io/badge/arXiv-2509.25171-b31b1b.svg)](https://arxiv.org/abs/2509.25171) + +[![arXiv](https://img.shields.io/badge/arXiv-2510.06308-b31b1b.svg)](https://arxiv.org/abs/2510.06308) + +[![arXiv](https://img.shields.io/badge/arXiv-2510.20668-b31b1b.svg)](https://arxiv.org/abs/2510.20668) [![GitHub](https://img.shields.io/badge/GitHub-Repo-181717?logo=github)](https://github.com/M-E-AGI-Lab/Awesome-World-Models) + +
+ +## 📝 Meissonic Updates and Family Papers + +- [MaskGIT: Masked Generative Image Transformer](https://arxiv.org/abs/2202.04200) [CVPR 2022] +- [Muse: Text-To-Image Generation via Masked Generative Transformers](https://arxiv.org/abs/2301.00704) [ICML 2023] +- [🌟][Meissonic: Revitalizing Masked Generative Transformers for Efficient High-Resolution Text-to-Image Synthesis](https://arxiv.org/abs/2410.08261) [ICLR 2025] +- [Bag of Design Choices for Inference of High-Resolution Masked Generative Transformer](https://arxiv.org/abs/2411.10781) +- [Di[𝙼]O: Distilling Masked Diffusion Models into One-step Generator](https://arxiv.org/abs/2503.15457) [ICCV 2025] +- [🌟][Muddit: Liberating Generation Beyond Text-to-Image with a Unified Discrete Diffusion Model](https://arxiv.org/abs/2505.23606) +- [DC-AR: Efficient Masked Autoregressive Image Generation with Deep Compression Hybrid Tokenizer](https://arxiv.org/pdf/2507.04947) [ICCV 2025] +- [MDNS: Masked Diffusion Neural Sampler via Stochastic Optimal Control](https://arxiv.org/abs/2508.10684) +- [Lavida-O: Elastic Large Masked Diffusion Models for Unified Multimodal Understanding and Generation](https://arxiv.org/abs/2509.19244) +- [🌟][Lumina-DiMOO: An Omni Diffusion Large Language Model for Multi-Modal Generation and Understanding](https://arxiv.org/abs/2510.06308) +- [Token Painter: Training-Free Text-Guided Image Inpainting via Mask Autoregressive Models](https://arxiv.org/abs/2509.23919) +- [TR2-D2: Tree Search Guided Trajectory-Aware Fine-Tuning for Discrete Diffusion](https://arxiv.org/abs/2509.25171) +- [OneFlow: Concurrent Mixed-Modal and Interleaved Generation with Edit Flows](https://arxiv.org/abs/2510.03506) +- [Diffuse Everything: Multimodal Diffusion Models on Arbitrary State Spaces](https://arxiv.org/abs/2506.07903) [ICML 2025] +- [Towards Better & Faster Autoregressive Image Generation: From the Perspective of Entropy](https://arxiv.org/abs/2510.09012) [NeurIPS 2025] +- [🌟][From Masks to Worlds: A Hitchhiker's Guide to World Models](https://arxiv.org/abs/2510.20668) +- [Soft-Di[M]O: Improving One-Step Discrete Image Generation with Soft Embeddings](https://arxiv.org/abs/2509.22925) + +- More papers are coming soon! +See [MeissonFlow Research](https://huggingface.co/MeissonFlow) (Organization Card) for more about our vision. + + +![Meissonic Demos](./assets/demos.png) + +## 🚀 Introduction + +Meissonic is a non-autoregressive mask image modeling text-to-image synthesis model that can generate high-resolution images. It is designed to run on consumer graphics cards. + +![Architecture](./assets/architecture.png) + +**Key Features:** +- 🖼️ High-resolution image generation (up to 1024x1024) +- 💻 Designed to run on consumer GPUs +- 🎨 Versatile applications: text-to-image, image-to-image + +## 🛠️ Prerequisites + +### Step 1: Clone the repository +```bash +git clone https://github.com/viiika/Meissonic/ +cd Meissonic +``` + +### Step 2: Create virtual environment +```bash +conda create --name meissonic python +conda activate meissonic +pip install -r requirements.txt +``` + +### Step 3: Install diffusers +```bash +git clone https://github.com/huggingface/diffusers.git +cd diffusers +pip install -e . 
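+# optional: verify that the editable diffusers install is importable
+python -c "import diffusers; print(diffusers.__version__)"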
+``` + +## 💡 Inference Usage + +### Gradio Web UI + +```bash +python app.py +``` + +### Command-line Interface + +#### Text-to-Image Generation + +```bash +python inference.py --prompt "Your creative prompt here" +``` + +#### Inpainting and Outpainting + +```bash +python inpaint.py --mode inpaint --input_image path/to/image.jpg +python inpaint.py --mode outpaint --input_image path/to/image.jpg +``` + +### Advanced: FP8 Quantization + +Optimize performance with FP8 quantization: + +Requirements: +- CUDA 12.4 +- PyTorch 2.4.1 +- TorchAO + +Note: Windows users install TorchAO using +```shell +pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cpu +``` + +Command-line inference +```shell +python inference_fp8.py --quantization fp8 +``` + +Gradio for FP8 (Select Quantization Method in Advanced settings) +```shell +python app_fp8.py +``` + +#### Performance Benchmarks + +| Precision (Steps=64, Resolution=1024x1024) | Batch Size=1 (Avg. Time) | Memory Usage | +|-------------------------------------------|--------------------------|--------------| +| FP32 | 13.32s | 12GB | +| FP16 | 12.35s | 9.5GB | +| FP8 | 12.93s | 8.7GB | + +## 🎨 Showcase + +
+Sample generations for the prompts:
+
+- "A pillow with a picture of a Husky on it."
+- "A white coffee mug, a solid black background"
+
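+To reproduce these prompts locally, the command-line interface from the Inference Usage section is enough. Below is a minimal batch sketch; it assumes only the documented `inference.py --prompt` flag and leaves every other option at its default:
+
+```python
+# batch_showcase.py -- minimal sketch: call the documented CLI once per showcase prompt.
+# Only the --prompt flag shown in "Inference Usage" is assumed; outputs follow inference.py defaults.
+import subprocess
+import sys
+
+PROMPTS = [
+    "A pillow with a picture of a Husky on it.",
+    "A white coffee mug, a solid black background",
+]
+
+for prompt in PROMPTS:
+    print(f"Generating: {prompt}")
+    result = subprocess.run([sys.executable, "inference.py", "--prompt", prompt])
+    if result.returncode != 0:
+        print(f"Generation failed for: {prompt}", file=sys.stderr)
+```
+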
+ +## 🎓 Training + +To train Meissonic, follow these steps: + +1. Install dependencies: + ```bash + cd train + pip install -r requirements.txt + ``` + +2. Download the [Meissonic](https://huggingface.co/MeissonFlow/Meissonic) base model from Hugging Face. + +3. Prepare your dataset: + - Use the sample dataset: [MeissonFlow/splash](https://huggingface.co/datasets/MeissonFlow/lemon/resolve/main/0000.parquet) + - Or prepare your own dataset and dataset class following the format in line 100 in [dataset_utils.py](./train/dataset_utils.py) and line 656-680 in [train_meissonic.py](./train/train_meissonic.py) + - Modify [train.sh](./train/train.sh) with your dataset path + +4. Start training: + ```bash + bash train/train.sh + ``` + +Note: For custom datasets, you'll likely need to implement your own dataset class. + + +## 📚 Citation + +If you find this work helpful, please consider citing: + +```bibtex +@article{bai2024meissonic, + title={Meissonic: Revitalizing Masked Generative Transformers for Efficient High-Resolution Text-to-Image Synthesis}, + author={Bai, Jinbin and Ye, Tian and Chow, Wei and Song, Enxin and Chen, Qing-Guo and Li, Xiangtai and Dong, Zhen and Zhu, Lei and Yan, Shuicheng}, + journal={arXiv preprint arXiv:2410.08261}, + year={2024} +} +``` + +## 🙏 Acknowledgements + +We thank the community and contributors for their invaluable support in developing Meissonic. We thank apolinario@multimodal.art for making Meissonic [Demo](https://huggingface.co/spaces/MeissonFlow/meissonic). We thank @NewGenAI and @飛鷹しずか@自称文系プログラマの勉強 for making YouTube tutorials. We thank @pprp for making fp8 and int4 quantization. We thank @camenduru for making [jupyter tutorial](https://github.com/camenduru/Meissonic-jupyter). We thank @chenxwh for making Replicate demo and api. We thank Collov Labs for reproducing [Monetico](https://huggingface.co/Collov-Labs/Monetico). We thank [Shitong et al.](https://arxiv.org/abs/2411.10781) for identifying effective design choices for enhancing visual quality. + + +--- + +

+ + Star History Chart + +

+ +

+ Made with ❤️ by MeissonFlow Research +

diff --git a/Meissonic/VidTok/.gitignore b/Meissonic/VidTok/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..81849eac5c147c73d4202463658bd4b49571d163 --- /dev/null +++ b/Meissonic/VidTok/.gitignore @@ -0,0 +1,167 @@ +amlt +.amltconfig +checkpoints +logs +wandb +tmp + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ \ No newline at end of file diff --git a/Meissonic/VidTok/CODE_OF_CONDUCT.md b/Meissonic/VidTok/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..f9ba8cf65f3e3104dd061c178066ec8247811f33 --- /dev/null +++ b/Meissonic/VidTok/CODE_OF_CONDUCT.md @@ -0,0 +1,9 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/Meissonic/VidTok/LICENSE b/Meissonic/VidTok/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9e841e7a26e4eb057b24511e7b92d42b257a80e5 --- /dev/null +++ b/Meissonic/VidTok/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/Meissonic/VidTok/README.md b/Meissonic/VidTok/README.md new file mode 100644 index 0000000000000000000000000000000000000000..edd9df5e2319e6dab35604d0d67192381dfacabf --- /dev/null +++ b/Meissonic/VidTok/README.md @@ -0,0 +1,472 @@ +
+ +# VidTok
A Family of Versatile and State-Of-The-Art Video Tokenizers + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2412.13061)   [![GitHub](https://img.shields.io/badge/GitHub-Code-blue?logo=github&logoColor=white)](https://github.com/microsoft/VidTok)   [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow)](https://huggingface.co/microsoft/VidTok) +
+ +--- + +![radar](assets/radar.png) + + We introduce VidTok, a cutting-edge family of video tokenizers that excels in both continuous and discrete tokenizations. VidTok incorporates several key advancements over existing approaches: + * ⚡️ **Efficient Architecture**. Separate spatial and temporal sampling reduces computational complexity without sacrificing quality. + * 🔥 **Advanced Quantization**. Finite Scalar Quantization (FSQ) addresses training instability and codebook collapse in discrete tokenization. + * 💥 **Enhanced Training**. A two-stage strategy—pre-training on low-res videos and fine-tuning on high-res—boosts efficiency. Reduced frame rates improve motion dynamics representation. + +VidTok, trained on a large-scale video dataset, outperforms previous models across all metrics, including PSNR, SSIM, LPIPS, and FVD. + +https://github.com/user-attachments/assets/a3341037-130d-4a83-aba6-c3daeaf66932 + +## 🔥 News +- August, 2025: 🚀 Introduced spatial tiling for large resolutions (>256), reducing GPU memory usage to ~6 GB when encoding and decoding a 17 × 768 × 768 video. +* March, 2025: 🚀 [VidTwin](https://github.com/microsoft/VidTok/tree/main/vidtwin) has been accepted by CVPR 2025, and the [checkpoint](https://huggingface.co/microsoft/vidtwin) was released! +* March, 2025: 🚀 [VidTok v1.1](#-updates-in-vidtok-v11) was released! We fine-tuned all causal models on long videos to support tokenization and reconstruction of videos of arbitrary length with fine temporal smoothness. [Relevant checkpoints](https://huggingface.co/microsoft/VidTok/tree/main/checkpoints/vidtok_v1_1) are continuously updating. +* December, 2024: 🚀 [VidTwin](https://github.com/microsoft/VidTok/tree/main/vidtwin) was released! +* December, 2024: 🚀 [VidTok](https://github.com/microsoft/vidtok) was released! + + +## 💥 Updates in VidTok v1.1 +> VidTok v1.1 is an update for causal models. We fine-tuned all causal models on long videos to support tokenization and reconstruction of videos of arbitrary length with fine temporal smoothness. See performance [here](#v11-performance). + +### v1.1: Long Video Reconstruction +Run the following inference script to [reconstruct an input video](#reconstruct-an-input-video): +```bash +python scripts/inference_reconstruct.py --config CONFIG_v1_1 --ckpt CKPT_v1_1 --input_video_path VIDEO_PATH --input_height 256 --input_width 256 --sample_fps 30 --chunk_size CHUNK_SIZE --output_video_dir OUTPUT_DIR --read_long_video +# Set `CHUNK_SIZE` according to your GPU memory, recommendly 16. +``` +and run the following inference script to [evaluate the reconstruction performance](#performance-evaluation): +```bash +python scripts/inference_evaluate.py --config CONFIG_v1_1 --ckpt CKPT_v1_1 --data_dir DATA_DIR --input_height 256 --input_width 256 --sample_fps 30 --chunk_size CHUNK_SIZE --read_long_video +# Set `CHUNK_SIZE` according to your GPU memory, recommendly 16. +``` + +For an easy usage of VidTok v1.1 models, refer to [this script](#easy-usage) and make the following revision: +```python +# Use VidTok v1.1 models +cfg_path = "configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml" +ckpt_path = "checkpoints/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.ckpt" + +... + +model.to('cuda').eval() +# Using tiling inference to save memory usage +model.use_tiling = True +model.t_chunk_enc = 16 +model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor +model.use_overlap = True +# random input: long video +x_input = (torch.rand(1, 3, 129, 256, 256) * 2 - 1).to('cuda') + +... 
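+# the elided lines run the forward pass from the Easy Usage example below;
+# if the reconstruction comes back longer than the input, keep only the last
+# x_input.shape[2] frames so the two tensors match: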
+ +if x_recon.shape[2] != x_input.shape[2]: + x_recon = x_recon[:, :, -x_input.shape[2]:, ...] +``` + +### v1.1: Long Video Fine-tuning +Follow this [training guidance](#fine-tune-on-custom-data) to fine-tune on your custom long video data and note that: +- Compared to VidTok v1.0, we tend to use longer sequences to fine-tune the model (for example, setting `NUM_FRAMES_1` to 33, 49, or larger). +- The resolution and the sequence length of training data should be adjusted according to GPU memory. + +### v1.1: Performance +| Model | Regularizer | Causal | VCR | PSNR | SSIM | LPIPS | FVD | +|------|------|------|------|------|------|------|------| +| [vidtok_kl_causal_488_16chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.ckpt) | KL-16chn | ✔️ | 4x8x8 | 35.13 | 0.941 | 0.049 | 87.4 | +| [vidtok_kl_causal_41616_16chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.ckpt) | KL-16chn | ✔️ | 4x16x16 | 29.61 | 0.854 | 0.113 | 162.7 | +| [vidtok_kl_causal_288_8chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.ckpt) | KL-8chn | ✔️ | 2x8x8 | 34.59 | 0.935 | 0.051 | 78.2 | +| [vidtok_fsq_causal_488_32768_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.ckpt) | FSQ-32,768 | ✔️ | 4x8x8 | 29.39 | 0.856 | 0.114 | 168.5 | +| [vidtok_fsq_causal_888_32768_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.ckpt) | FSQ-32,768 | ✔️ | 8x8x8 | 27.95 | 0.817 | 0.142 | 293.2 | +- This is the evaluation result of long video reconstruction conducted on each complete video in [MCL_JCL](https://mcl.usc.edu/mcl-jcv-dataset/) dataset, with a sample fps of 30 and a resolution of `256x256`. + + +## 🔧 Setup +1. Clone this repository and navigate to VidTok folder: +```bash +git clone https://github.com/microsoft/VidTok +cd VidTok +``` +2. We provide an `environment.yaml` file for setting up a Conda environment. Conda's installation instructions are available [here](https://docs.anaconda.com/miniconda/index.html). +```bash +# 1. Prepare conda environment +conda env create -f environment.yaml +# 2. Activate the environment +conda activate vidtok +``` + +We recommend using 1+ high-end GPU for training and inference. We have done all testing and development using A100 and MI300X GPUs. For convenience, we also provide prebuilt [Docker](https://hub.docker.com/) images with required dependencies. You can use it as follows: + +```bash +# NVIDIA GPUs +docker run -it --gpus all --shm-size 256G --rm -v `pwd`:/workspace --workdir /workspace \ + deeptimhe/ubuntu22.04-cuda12.1-python3.10-pytorch2.5:orig-vidtok bash +# AMD GPUs +docker run -it --gpus all --shm-size 256G --rm -v `pwd`:/workspace --workdir /workspace \ + deeptimhe/ubuntu22.04-rocm6.2.4-python3.10-pytorch2.5:orig-vidtok bash +``` + +## 🎈 Checkpoints +Download pre-trained models [here](https://huggingface.co/microsoft/VidTok/tree/main/checkpoints), and put them in `checkpoints` folder, like: +``` +└── checkpoints + ├── vidtok_v1_1 + │ ├── vidtok_kl_causal_488_16chn_v1_1.ckpt + │ └── ... 
+ ├── vidtok_fsq_causal_41616_262144.ckpt + ├── vidtok_fsq_causal_488_262144.ckpt + ├── vidtok_fsq_causal_488_32768.ckpt + ├── vidtok_fsq_causal_488_4096.ckpt + ├── vidtok_fsq_noncausal_41616_262144.ckpt + ├── vidtok_fsq_noncausal_488_262144.ckpt + ├── vidtok_kl_causal_288_8chn.ckpt + ├── vidtok_kl_causal_41616_4chn.ckpt + ├── vidtok_kl_causal_444_4chn.ckpt + ├── vidtok_kl_causal_488_16chn.ckpt + ├── vidtok_kl_causal_488_4chn.ckpt + ├── vidtok_kl_causal_488_8chn.ckpt + ├── vidtok_kl_noncausal_41616_16chn.ckpt + ├── vidtok_kl_noncausal_41616_4chn.ckpt + ├── vidtok_kl_noncausal_488_16chn.ckpt + └── vidtok_kl_noncausal_488_4chn.ckpt +``` +Each checkpoint has a corresponding config file with the same name in `configs` folder. + + +## 🔆 Performance + +| Model | Regularizer | Causal | VCR | PSNR | SSIM | LPIPS | FVD | +|------|------|------|------|------|------|------|------| +| [vidtok_kl_causal_488_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_4chn.ckpt) | KL-4chn | ✔️ | 4x8x8 | 29.64 | 0.852| 0.114| 194.2| +| [vidtok_kl_causal_488_8chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_8chn.ckpt) | KL-8chn | ✔️ |4x8x8 | 31.83 | 0.897| 0.083| 109.3| +| [vidtok_kl_causal_488_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_16chn.ckpt) | KL-16chn | ✔️ | 4x8x8 | 35.04 |0.942 |0.047 | 78.9| +| [vidtok_kl_causal_288_8chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_288_8chn.ckpt) | KL-8chn | ✔️ | 2x8x8 | 33.86 | 0.928 |0.057 | 80.7 | +| [vidtok_kl_causal_444_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_444_4chn.ckpt) | KL-4chn | ✔️ | 4x4x4 | 34.78 | 0.941 | 0.051| 87.2| +| [vidtok_kl_causal_41616_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_41616_4chn.ckpt) | KL-4chn | ✔️ | 4x16x16 | 25.05 | 0.711| 0.228| 549.1| +| [vidtok_kl_noncausal_488_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_488_4chn.ckpt) | KL-4chn | ✖️ | 4x8x8 | 30.60 | 0.876 | 0.098| 157.9| +| [vidtok_kl_noncausal_488_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_488_16chn.ckpt) | KL-16chn | ✖️ | 4x8x8 | 36.13 | 0.950 | 0.044| 60.5| +| [vidtok_kl_noncausal_41616_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_41616_4chn.ckpt) | KL-4chn | ✖️ | 4x16x16 | 26.06 | 0.751 | 0.190|423.2 | +| [vidtok_kl_noncausal_41616_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_41616_16chn.ckpt) | KL-16chn | ✖️ | 4x16x16 | 30.69 | 0.878 | 0.095| 147.1| +| [vidtok_fsq_causal_488_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_262144.ckpt) | FSQ-262,144 | ✔️ | 4x8x8 | 29.82 | 0.867 |0.106 | 160.1| +| [vidtok_fsq_causal_488_32768](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_32768.ckpt) | FSQ-32,768 | ✔️ | 4x8x8 | 29.16 | 0.854 | 0.117| 196.9| +| [vidtok_fsq_causal_488_4096](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_4096.ckpt) | FSQ-4096 | ✔️ | 4x8x8 | 28.36 | 0.832 | 0.133| 218.1| +| [vidtok_fsq_causal_41616_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_41616_262144.ckpt) | FSQ-262,144 | ✔️ | 4x16x16 | 25.38 | 0.738 |0.206 | 430.1| +| 
[vidtok_fsq_noncausal_488_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_noncausal_488_262144.ckpt) | FSQ-262,144 | ✖️ | 4x8x8 | 30.78 | 0.889| 0.091| 132.1| +| [vidtok_fsq_noncausal_41616_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_noncausal_41616_262144.ckpt) | FSQ-262,144 | ✖️ | 4x16x16 | 26.37 | 0.772| 0.171| 357.0| + +- `VCR` indicates the video compression ratio `TxHxW`. +- The above table shows model performance evaluated on 30 test videos in [MCL_JCL](https://mcl.usc.edu/mcl-jcv-dataset/) dataset, with a sample fps of 30. The input size is `17x256x256` for causal models and `16x256x256` for non-causal models. + +## 🔛 Training + +### Data Preparation +1. Put all training videos under `DATA_DIR`: +``` +└── DATA_DIR + ├── subset1 + │ ├── videoname11.mp4 + │ └── videoname12.mp4 + ├── subset2 + │ ├── videoname21.mp4 + │ ├── videoname22.mp4 + │ └── subsubset1 + │ ├── videoname211.mp4 + │ └── videoname212.mp4 + └── ... +``` +2. Prepare a `.csv` meta file to record the relative paths of these videos with respect to `DATA_DIR`, like: +``` +videos +subset1/videoname11.mp4 +subset2/videoname21.mp4 +subset2/subsubset1/videoname211.mp4 +``` + +> Validation data is also prepared following the above steps. + +### Fine-tune on Custom Data +1. Prepare your own training and validation data following [Data Preparation](#data-preparation). +2. Select the appropriate `CONFIG` file from `configs` folder based on your needs, and modify the following parameters: + - Specify the `ckpt_path` parameter to initialize the model with pre-trained checkpoint parameters: + ```yaml + model: + params: + ckpt_path: PATH_TO_CHECKPOINT # train from existing checkpoint + ``` + - Specify the `data` section to use your own training and validation data: + ```yaml + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: NUM_FRAMES_1 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_1 # sample fps for training data + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: NUM_FRAMES_2 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_2 # sample fps for validation data + start_index: 0 # fixed value to ensure the same sampled data + ``` + - Set `fix_encoder` and `fix_decoder` to be `False` to enable full model fine-tuning: + ```yaml + model: + params: + encoder_config: + params: + fix_encoder: false + fix_decoder: false + ``` + - Other hyperparameters according to your needs. + +3. Run the following command to start training: +```bash +python main.py -b CONFIG --logdir LOGDIR + +# You can also use `torchrun` to start the training code. +``` +Training logs and checkpoints are saved in `LOGDIR`. + +It is recommended to use [Weights & Biases](https://wandb.ai/site) as the data visualization tool ([TensorBoard](https://www.tensorflow.org/tensorboard) by default). 
Use `wandb login` to log in first, and then run: +```bash +python main.py -b CONFIG --logdir LOGDIR --wandb --wandb_entity ENTITY --wandb_project PROJECT +``` + + +### Train from Scratch +
+Two-stage Training +We adopt a two-stage training strategy to improve training efficiency: initially pre-training the full model on low-resolution videos, followed by fine-tuning only the decoder on high-resolution videos. + +| First Stage | Second Stage | Fix encoder | PSNR | SSIM | LPIPS | GPU Hours| +|------|------|------|------|------|------|------| +| 256 x 256 | - | - | 29.19 | 0.843 | 0.127| 3,072| +| 128 x 128 | 256 x 256 | ✔️ | 29.21 | 0.843 | 0.125| 1,536| + +1. Prepare your own training and validation data following [Data Preparation](#data-preparation). +2. Select the appropriate `CONFIG` file from `configs` folder based on your needs, and specify the `data` section to use your own training and validation data: + ```yaml + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 # vary in different training stages + input_width: INPUT_WIDTH_1 # vary in different training stages + sample_num_frames: NUM_FRAMES_1 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_1 # sample fps for training data + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: NUM_FRAMES_2 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_2 # sample fps for validation data + start_index: 0 # fixed value to ensure the same sampled data + ``` + +3. Start the first stage of training. First, revise the `CONFIG` file to enable full model training with low-resolution data: +```yaml +model: + params: + # ckpt_path: # disable this parameter so as to train from scratch + encoder_config: + params: + fix_encoder: false + fix_decoder: false +data: + params: + train: + params: + video_params: + input_height: 128 + input_width: 128 +``` +Then revise other hyperparameters according to your needs, and run the training command to start training as in [Fine-tune on Custom Data](#fine-tune-on-custom-data). We train VidTok for 50,000 steps with batch size 16 in this stage. + +4. Start the second stage of training. First, revise the `CONFIG` file to enable the fine-tuning of the decoder with high-resolution data: +```yaml +model: + params: + ckpt_path: CKPT_PATH # path to the saved checkpoint after the first stage of training + encoder_config: + params: + fix_encoder: true + fix_decoder: false +data: + params: + train: + params: + video_params: + input_height: 256 + input_width: 256 +``` +Then revise other hyperparameters according to your needs, and run the training command to start training as in [Fine-tune on Custom Data](#fine-tune-on-custom-data). We train VidTok for 30,000 steps with batch size 8 in this stage. +
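+Both fine-tuning and training from scratch start from the `.csv` meta file described in [Data Preparation](#data-preparation). The following is a minimal sketch (not part of the repository) that builds such a meta file by scanning `DATA_DIR` for `.mp4` files and recording their paths relative to it:
+
+```python
+# make_meta_csv.py -- minimal sketch (not shipped with VidTok): write the meta file
+# expected by vidtok.data.vidtok.VidTokDataset, i.e. a "videos" header followed by
+# one path per line, relative to DATA_DIR.
+import csv
+from pathlib import Path
+
+def write_meta_csv(data_dir: str, meta_path: str) -> int:
+    root = Path(data_dir)
+    rows = sorted(p.relative_to(root).as_posix() for p in root.rglob("*.mp4"))
+    with open(meta_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["videos"])            # header line, as shown in Data Preparation
+        writer.writerows([r] for r in rows)    # one relative video path per line
+    return len(rows)
+
+if __name__ == "__main__":
+    n = write_meta_csv("DATA_DIR", "meta.csv")  # replace with your actual paths
+    print(f"Wrote {n} video paths to meta.csv")
+```
+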
+ + +## 🚀 Inference + +### Easy Usage +We provide the following example for a quick usage of our models. It works for both continuous and discrete tokenization and both causal and non-causal models. +Just provide the path to the configuration file `cfg_path` and checkpoint file `ckpt_path`. +```python +import torch +from scripts.inference_evaluate import load_model_from_config + +cfg_path = "configs/vidtok_kl_causal_488_4chn.yaml" +ckpt_path = "checkpoints/vidtok_kl_causal_488_4chn.ckpt" + +# load pre-trained model +model = load_model_from_config(cfg_path, ckpt_path) +model.to('cuda').eval() +# random input +num_frames = 17 if model.is_causal else 16 +x_input = (torch.rand(1, 3, num_frames, 256, 256) * 2 - 1).to('cuda') # [B,C,T,H,W], range -1~1 +# model forward +with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16): + _, x_recon, _ = model(x_input) +assert x_input.shape == x_recon.shape +``` +If you want to directly infer from latent tokens, run the following code: +```python +z, reg_log = model.encode(x_input, return_reg_log=True) +# infer from continuous latent space +x_recon = model.decode(z) +# infer from discrete latent tokens +x_recon = model.decode(reg_log['indices'], decode_from_indices=True) +``` + +### Use Torch Compile to Speed Up Inference +Use compiled components in VidTok can speed up inference by as much as 2X. The following code snippet demonstrates how to compile our models. + +```python +import torch +from scripts.inference_evaluate import load_model_from_config + +torch._inductor.config.cpp.weight_prepack=True +torch._inductor.config.freezing=True + +cfg_path = "configs/vidtok_kl_causal_488_4chn.yaml" +ckpt_path = "checkpoints/vidtok_kl_causal_488_4chn.ckpt" + +# load pre-trained model +model = load_model_from_config(cfg_path, ckpt_path) +model.to('cuda').eval() +model.encoder = torch.compile(model.encoder) +model.decoder = torch.compile(model.decoder) + +# random input +num_frames = 17 if model.is_causal else 16 +x_input = (torch.rand(1, 3, num_frames, 256, 256) * 2 - 1).to('cuda') # [B,C,T,H,W], range -1~1 + +# Warm Up +with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16): + _, x_recon, _ = model(x_input) + +torch.cuda.synchronize() +import time +start = time.time() +with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16): + for i in range(10): + _, x_recon, _ = model(x_input) +torch.cuda.synchronize() +print(f"Average inference time: {(time.time() - start)/10 :.4f} seconds") +``` + +### Reconstruct an Input Video +```bash +python scripts/inference_reconstruct.py --config CONFIG --ckpt CKPT --input_video_path VIDEO_PATH --input_height 256 --input_width 256 --sample_fps 30 --output_video_dir OUTPUT_DIR +``` +- Specify `VIDEO_PATH` to the path of your test video. We provide an example video in `assets/example.mp4`. +- The reconstructed video is saved in `OUTPUT_DIR`. +- For causal models, you can choose to add `--pad_gen_frames` to the command line, which may improve the smoothness of the reconstructed video. + +### Performance Evaluation +We also provide a manuscript `scripts/inference_evaluate.py` to evaluate the video reconstruction performance in PSNR, SSIM and LPIPS. + +1. Put all of your test videos under `DATA_DIR`. +2. 
Run the following command, and all `.mp4` videos under `DATA_DIR` will be tested: +```bash +python scripts/inference_evaluate.py --config CONFIG --ckpt CKPT --data_dir DATA_DIR --input_height 256 --input_width 256 --sample_fps 30 +``` +(Optional) If you only want to test certain videos under `DATA_DIR`, you need to prepare a `.csv` meta file +to indicate the video files to be tested (refer to [Data Preparation](#data-preparation)). And add `--meta_path META_PATH` to the above command to specify the path to the `.csv` meta file. + +## 💡 Intended Uses + +We are sharing our model with the research community to foster further research in this area: +* Training your own video tokenizers for research purpose. +* Video tokenization with various compression rates. + + +## 🪧 Out-of-scope Uses + +Our models are not specifically designed or evaluated for all downstream purposes. Developers should consider common limitations of video tokenizers (e.g., performance degradation on out-of-domain data) as they select use cases, and evaluate and mitigate for privacy, safety, and fairness before using within a specific downstream use case, particularly for high-risk scenarios. + +Developers should be aware of and adhere to applicable laws or regulations (including privacy, trade compliance laws, etc.) that are relevant to their use case. + + +## 🤖️ Risks and Limitations + +Some of the limitations of this model to be aware of include: +* VidTok may lose detailed information on the reconstructed content. +* VidTok inherits any biases, errors, or omissions characteristic of its training data. +* VidTok was developed for research and experimental purposes. Further testing and validation are needed before considering its application in commercial or real-world scenarios. + + +## 🤗 Acknowledgments + +This codebase borrows code from [generative-models](https://github.com/Stability-AI/generative-models). We thank Stability AI for its efforts and innovations, which have made the development process more efficient and convenient. + +Thank you to everyone who contributed their wisdom and efforts to this project. + +## ✏️ BibTeX + +```bibtex +@article{tang2024vidtok, + title={VidTok: A Versatile and Open-Source Video Tokenizer}, + author={Tang, Anni and He, Tianyu and Guo, Junliang and Cheng, Xinle and Song, Li and Bian, Jiang}, + year={2024}, + journal={arXiv preprint arXiv:2412.13061}, +} +``` + +## ☎️ Contact + +We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at tianyuhe@microsoft.com. + +## 📄 Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
+For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + + +## 📍 Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/Meissonic/VidTok/SECURITY.md b/Meissonic/VidTok/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..b3c89efc852e22f71eabf5dfbc6ac62493425eb6 --- /dev/null +++ b/Meissonic/VidTok/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. 
+ +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). + + diff --git a/Meissonic/VidTok/SUPPORT.md b/Meissonic/VidTok/SUPPORT.md new file mode 100644 index 0000000000000000000000000000000000000000..291d4d43733f4c15a81ff598ec1c99fd6c18f64c --- /dev/null +++ b/Meissonic/VidTok/SUPPORT.md @@ -0,0 +1,25 @@ +# TODO: The maintainer of this repo has not yet edited this file + +**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? + +- **No CSS support:** Fill out this template with information about how to file issues and get help. +- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. +- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. + +*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* + +# Support + +## How to file issues and get help + +This project uses GitHub Issues to track bugs and feature requests. Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or +feature request as a new Issue. + +For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE +FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER +CHANNEL. WHERE WILL YOU HELP PEOPLE?**. + +## Microsoft Support Policy + +Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/Meissonic/VidTok/assets/example.mp4 b/Meissonic/VidTok/assets/example.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..21be26c8a8f5dbb76de5a225e5b284fdbb024904 --- /dev/null +++ b/Meissonic/VidTok/assets/example.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:588ca89fae7320a079d4f77cf963f88075959f06310594ab35d3e04b844c4d50 +size 540937 diff --git a/Meissonic/VidTok/assets/gemini.png b/Meissonic/VidTok/assets/gemini.png new file mode 100644 index 0000000000000000000000000000000000000000..fe326dc4b4b4b6db82575b15e9b4bf2f3b63e63d Binary files /dev/null and b/Meissonic/VidTok/assets/gemini.png differ diff --git a/Meissonic/VidTok/assets/radar.png b/Meissonic/VidTok/assets/radar.png new file mode 100644 index 0000000000000000000000000000000000000000..6eff9338928faf73c0e39e859237bc6c588ec183 --- /dev/null +++ b/Meissonic/VidTok/assets/radar.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cfdef783e26264ff671f81845b54471b237f5cd7df0dbd63642fde0c20f935e +size 424581 diff --git a/Meissonic/VidTok/assets/vidtwin.png b/Meissonic/VidTok/assets/vidtwin.png new file mode 100644 index 0000000000000000000000000000000000000000..a7ae41105384d077a7d128d1d687e3ee8e17fbd8 --- /dev/null +++ b/Meissonic/VidTok/assets/vidtwin.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e58210f7ca7784a4df737f8a2ece2b5e9f85fff3bf423b24aa6aeeb0b196cef +size 549933 diff --git a/Meissonic/VidTok/assets/vidtwin_demo.png b/Meissonic/VidTok/assets/vidtwin_demo.png new file mode 100644 index 0000000000000000000000000000000000000000..ca79126104a7cbfb67c0acb70f03c6e66546675c --- /dev/null +++ b/Meissonic/VidTok/assets/vidtwin_demo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8e15c46050cc7957bc5d334bd1902d58b9be9aa7669df82f2ae9ba08c90585 +size 6371846 diff --git a/Meissonic/VidTok/checkpoints/lpips/vgg.pth 
b/Meissonic/VidTok/checkpoints/lpips/vgg.pth new file mode 100644 index 0000000000000000000000000000000000000000..f57dcf5cc764d61c8a460365847fb2137ff0a62d --- /dev/null +++ b/Meissonic/VidTok/checkpoints/lpips/vgg.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a78928a0af1e5f0fcb1f3b9e8f8c3a2a5a3de244d830ad5c1feddc79b8432868 +size 7289 diff --git a/Meissonic/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml b/Meissonic/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80b2edbebf97fd747933000d32ccbdfe0e135702 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_41616_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + 
num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_fsq_causal_488_262144.yaml b/Meissonic/VidTok/configs/vidtok_fsq_causal_488_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26f9c3e94ed8b64a681b2ed3887929109b6ccc53 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_fsq_causal_488_262144.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_fsq_causal_488_32768.yaml 
b/Meissonic/VidTok/configs/vidtok_fsq_causal_488_32768.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e661c533b367c7be4eea8de629e074b395e1684 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_fsq_causal_488_32768.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_32768.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_fsq_causal_488_4096.yaml b/Meissonic/VidTok/configs/vidtok_fsq_causal_488_4096.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bf654bb21c1bd67ddde3f4878497c6ad780503a --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_fsq_causal_488_4096.yaml @@ -0,0 +1,118 
@@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_4096.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8] # codebook size: 8*8*8*8=4096 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml b/Meissonic/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e400ff097155465e2a477bfbc3ca32f346a0ca12 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml @@ -0,0 +1,117 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_noncausal_41616_262144.ckpt # train from existing checkpoint + 
ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml b/Meissonic/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c8731af435aae0efbde9716ee696ce3efd30d25 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml @@ -0,0 +1,117 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_noncausal_488_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: 
false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_causal_288_8chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_causal_288_8chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c20bf1aaf2aa97f67d19d0de1dbac47cf2d55ac --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_causal_288_8chn.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_288_8chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [1] + tempo_us: [2] + time_downsample_factor: 2 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: 
vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1db41cd612f5e6b1390bebda33e559e347b60907 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_41616_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: 
vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_causal_444_4chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_causal_444_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bcbbd11cba27e60bd2f44866655adcfff91f91a6 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_causal_444_4chn.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_444_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + spatial_ds: [1, 2] + spatial_us: [1, 2] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + 
target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_causal_488_16chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_causal_488_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7527679fe0766616df45223fe1e9595101fadc5 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_causal_488_16chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: 
+ params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_causal_488_4chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_causal_488_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d14b0d75435b24affad90095e4d6e42c6525a8d --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_causal_488_4chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 
10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_causal_488_8chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_causal_488_8chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..336cbac8319dc3f9c232c02889e598fdf098a777 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_causal_488_8chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_8chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b32064f1589eb3b21cb978f808a95fd6ade31ff9 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml 
@@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_41616_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdd33a5ebc635cbf9a04fb94006d14a473d87a3c --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_41616_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + 
norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47a1090f42b07655626f0d522d756157fdeb1d6a --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_488_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + 
lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml b/Meissonic/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05bb8c54df1374c0f67c79d256195dd6451acfbd --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_488_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + 
data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74364fa3209abbda65b0c23311d92ad0975570dd --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml @@ -0,0 +1,120 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path 
to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..334749091ff2c561208f169029eae4704d4213e3 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml @@ -0,0 +1,120 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: 
INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2966072ff2a073d404d0cb438674acfc92319033 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml @@ -0,0 +1,122 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [0, 1, 2] + tempo_us: [1, 2, 3] + time_downsample_factor: 8 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + 
start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..deefcc29ecd8212e8bd3edd9d9870d8c64079db7 --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml @@ -0,0 +1,116 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [1] + tempo_us: [2] + time_downsample_factor: 2 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: 
vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..965b243859e513244d3d3fd9cc68aa27aee887da --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + 
precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26f68342b48dd5fcae8e5bfcd2fcb5cc5bd1ab1c --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git 
a/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4badafed742c03fd7850f197cdb8207c59b992e --- /dev/null +++ b/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml b/Meissonic/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bcd59650c229ac57560602854015262969d658b --- /dev/null +++ 
b/Meissonic/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml @@ -0,0 +1,154 @@ +model: + base_learning_rate: 1.6e-4 + target: vidtwin.models.vidtwin_ae.VidAutoEncoderQformerCompactSymVidVAE + params: + input_key: jpg + monitor: val/rec_loss + ckpt_path: PATH_TO_CHECKPOINT + ignore_keys: [] + expect_ch: 8 + cont_num_blocks: 1 + downsample_motion: True + motion_num_blocks: 1 + d_dim: 8 + + temporal_qformer_config: + target: vidtwin.modules.qformer.MyQformerInterface + params: + num_query_tokens: 16 + query_hidden_size: 64 + encoder_hidden_size: 768 + + encoder_config: + target: vidtwin.modules.st_transformer.STTEncoder + params: + in_channels: 3 + input_size: [16, 224, 224] + patch_size: [1, 16, 16] + hidden_size: 768 + depth: 16 + num_heads: 12 + temporal_casual: true + + decoder_config: + target: vidtwin.modules.st_transformer.STTDecoder + params: + in_channels: 3 + input_size: [16, 224, 224] + patch_size: [1, 16, 16] + hidden_size: 768 + depth: 16 + num_heads: 12 + temporal_casual: true + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + perceptual_weight: 0.05 + disc_start: 20001 + disc_weight: 0.05 + learn_logvar: True + dims: 3 + disc_type: 2d + regularization_weights: + kl_loss: 0.001 + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + params: + sample: True + + + lr_scheduler_config_d: + target: vidtok.models.vidtwin_ae.LambdaWarmUpCosineScheduler + params: + lr_min: 0 + lr_max: 1.5e-05 + lr_start: 1.0e-05 + warmup_steps: 5000 + lr_scheduler_config_g: + target: vidtok.models.vidtwin_ae.LambdaWarmUpCosineScheduler + params: + lr_min: 0 + lr_max: 3.0e-05 + lr_start: 0 + warmup_steps: 5000 + optimizer_config: + target: torch.optim.AdamW + params: + betas: + - 0 + - 0.9 + weight_decay: 0.0001 + lr_scheduler_config: + target: inverse_sqrt + params: + num_warmup_steps: 2000 + frequency: 1 + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: 224 + input_width: 224 + sample_num_frames: 16 + sample_fps: 8 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: 224 + input_width: 224 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: True + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 2 + + + + trainer: + # precision: bf16-mixed # 16-mixed + benchmark: True + devices: 4 + num_sanity_val_steps: 10 + val_check_interval: 5000 + accumulate_grad_batches: 1 + max_epochs: 10 diff --git a/Meissonic/VidTok/environment.yaml b/Meissonic/VidTok/environment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6abf1dcc7cefdab27b272681c0395791b3d432f --- /dev/null +++ b/Meissonic/VidTok/environment.yaml @@ -0,0 +1,114 @@ 
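The VidTok v1.1 and VidTwin configs above all follow the same OmegaConf pattern: each target: key names the class to instantiate and params: gives its constructor arguments, the decoder re-uses the encoder settings through the ${model.params.encoder_config.params} interpolation, and the FSQ variant's levels: [8, 8, 8, 8, 8] gives a codebook of 8^5 = 32768 entries, matching the 32768 in its filename. As a minimal sketch (assuming the DATA_DIR_*, META_PATH_* and INPUT_* placeholders have been filled in, and using only instantiate_from_config from vidtok.modules.util, which the scripts below already import), such a config can be turned into a model like this:

    from omegaconf import OmegaConf
    from vidtok.modules.util import instantiate_from_config

    # Hypothetical paths - substitute your own config and checkpoint.
    config = OmegaConf.load("configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml")
    config.model.params.ckpt_path = "checkpoints/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.ckpt"

    # Builds the AutoencodingEngine with its encoder, decoder and regularizer.
    model = instantiate_from_config(config.model).eval()

The environment.yaml that follows defines a conda environment named vidtok (Python 3.10, torch 2.2.2, lightning 2.2.4); it can typically be created with `conda env create -f environment.yaml` and activated with `conda activate vidtok`.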
+name: vidtok +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2024.11.26=h06a4308_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - libuuid=1.41.5=h5eee18b_0 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.15=h5eee18b_0 + - pip=24.2=py310h06a4308_0 + - python=3.10.15=he870216_1 + - readline=8.2=h5eee18b_0 + - setuptools=75.1.0=py310h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tk=8.6.14=h39e8969_0 + - wheel=0.44.0=py310h06a4308_0 + - xz=5.4.6=h5eee18b_1 + - zlib=1.2.13=h5eee18b_1 + - pip: + - absl-py==2.1.0 + - aiohappyeyeballs==2.4.4 + - aiohttp==3.11.9 + - aiosignal==1.3.1 + - antlr4-python3-runtime==4.9.3 + - appdirs==1.4.4 + - async-timeout==5.0.1 + - attrs==24.2.0 + - av==12.0.0 + - beartype==0.18.2 + - certifi==2024.8.30 + - charset-normalizer==3.4.0 + - click==8.1.7 + - contourpy==1.3.1 + - cycler==0.12.1 + - decord==0.6.0 + - docker-pycreds==0.4.0 + - einops==0.8.0 + - filelock==3.16.1 + - fonttools==4.55.1 + - frozenlist==1.5.0 + - fsspec==2024.10.0 + - gitdb==4.0.11 + - gitpython==3.1.43 + - grpcio==1.68.1 + - idna==3.10 + - imageio==2.34.0 + - jinja2==3.1.4 + - kiwisolver==1.4.7 + - lightning==2.2.4 + - lightning-utilities==0.11.9 + - markdown==3.7 + - markdown-it-py==3.0.0 + - markupsafe==3.0.2 + - matplotlib==3.8.4 + - mdurl==0.1.2 + - mpmath==1.3.0 + - multidict==6.1.0 + - natsort==8.4.0 + - networkx==3.4.2 + - numpy==1.26.4 + - nvidia-cublas-cu12==12.1.3.1 + - nvidia-cuda-cupti-cu12==12.1.105 + - nvidia-cuda-nvrtc-cu12==12.1.105 + - nvidia-cuda-runtime-cu12==12.1.105 + - nvidia-cudnn-cu12==8.9.2.26 + - nvidia-cufft-cu12==11.0.2.54 + - nvidia-curand-cu12==10.3.2.106 + - nvidia-cusolver-cu12==11.4.5.107 + - nvidia-cusparse-cu12==12.1.0.106 + - nvidia-nccl-cu12==2.19.3 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.1.105 + - omegaconf==2.3.0 + - opencv-python==4.6.0.66 + - packaging==24.2 + - pandas==2.1.4 + - pillow==11.0.0 + - propcache==0.2.1 + - protobuf==4.25.5 + - psutil==6.1.0 + - pygments==2.18.0 + - pyparsing==3.2.0 + - python-dateutil==2.9.0.post0 + - pytorch-lightning==2.2.4 + - pytz==2024.2 + - pyyaml==6.0.2 + - requests==2.32.3 + - rich==13.5.3 + - safetensors==0.4.2 + - sentry-sdk==2.19.0 + - setproctitle==1.3.4 + - six==1.17.0 + - smmap==5.0.1 + - sympy==1.13.3 + - tensorboard==2.16.2 + - tensorboard-data-server==0.7.2 + - torch==2.2.2 + - torchmetrics==1.6.0 + - torchvision==0.17.2 + - tqdm==4.67.1 + - triton==2.2.0 + - typing-extensions==4.12.2 + - tzdata==2024.2 + - urllib3==2.2.3 + - wandb==0.16.6 + - werkzeug==3.1.3 + - yarl==1.18.3 diff --git a/Meissonic/VidTok/main.py b/Meissonic/VidTok/main.py new file mode 100644 index 0000000000000000000000000000000000000000..e03f0de8a1c683fbdde7177083c87a0a3df85f83 --- /dev/null +++ b/Meissonic/VidTok/main.py @@ -0,0 +1,1124 @@ +import argparse +import datetime +import pytz +import glob +import inspect +import os +import re +import sys +import numpy as np +import warnings +warnings.filterwarnings("ignore") +from rich import print +from inspect import Parameter +from typing import Union +from matplotlib import pyplot as plt +from natsort import natsorted +from omegaconf import OmegaConf +from packaging import version +from PIL import Image +from pathlib import Path + +import torch +import torch.distributed as dist +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch 
import seed_everything +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +from vidtok.modules.util import (exists, instantiate_from_config, isheatmap, + print0, seed_anything) + +MULTINODE_HACKS = True + + +def default_trainer_args(): + argspec = dict(inspect.signature(Trainer.__init__).parameters) + argspec.pop("self") + default_args = { + param: argspec[param].default + for param in argspec + if argspec[param] != Parameter.empty + } + return default_args + + +def get_step_value(folder_name): + match = re.search(r"step=(\d+)", folder_name) + if match: + return int(match.group(1)) + return 0 + + +def get_parser(**parser_kwargs): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser(**parser_kwargs) + parser.add_argument( + "-n", + "--name", + type=str, + const=True, + default="", + nargs="?", + help="postfix for logdir", + ) + parser.add_argument( + "--no_date", + type=str2bool, + nargs="?", + const=True, + default=False, + help="if True, skip date generation for logdir and only use naming via opt.base or opt.name (+ opt.postfix, optionally)", + ) + parser.add_argument( + "-r", + "--resume", + type=str, + const=True, + default="", + nargs="?", + help="resume from logdir or checkpoint in logdir", + ) + parser.add_argument( + "-b", + "--base", + nargs="*", + metavar="base_config.yaml", + help="paths to base configs. Loaded from left-to-right. " + "Parameters can be overwritten or added with command-line options of the form `--key value`.", + default=list(), + ) + parser.add_argument( + "-t", + "--train", + type=str2bool, + const=True, + default=True, + nargs="?", + help="train", + ) + parser.add_argument( + "--no-test", + type=str2bool, + const=True, + default=True, + nargs="?", + help="disable test", + ) + parser.add_argument( + "-p", "--project", help="name of new or path to existing project" + ) + parser.add_argument( + "-d", + "--debug", + type=str2bool, + nargs="?", + const=True, + default=False, + help="enable post-mortem debugging", + ) + parser.add_argument( + "-s", + "--seed", + type=int, + default=23, + help="seed for seed_everything", + ) + parser.add_argument( + "--seed_rank", + type=str2bool, + nargs="?", + const=True, + default=False, + help="reset seed every rank on fit start", + ) + parser.add_argument( + "-f", + "--postfix", + type=str, + default="", + help="post-postfix for default name", + ) + parser.add_argument( + "-l", + "--logdir", + type=str, + default="logs", + help="directory for logging dat shit", + ) + parser.add_argument( + "--scale_lr", + type=str2bool, + nargs="?", + const=True, + default=False, + help="scale base-lr by ngpu * batch_size * n_accumulate", + ) + parser.add_argument( + "--legacy_naming", + type=str2bool, + nargs="?", + const=True, + default=False, + help="name run based on config file name if true, else by whole path", + ) + parser.add_argument( + "--enable_tf32", + type=str2bool, + nargs="?", + const=True, + default=True, + help="enables the TensorFloat32 format both for matmuls and cuDNN for pytorch 1.12", + ) + parser.add_argument( + "--startup", + type=str, + default=None, + help="Startuptime from distributed script", + ) + parser.add_argument( 
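# [Editor's note, not part of the original file] --wandb switches the run's logger from the
# default TensorBoardLogger to WandbLogger (see default_logger_cfgs further down in this
# script); the --wandb_entity / --wandb_key / --wandb_project / --wandb_id /
# --wandb_auto_resume flags defined below configure that logger and its resume behaviour.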
+ "--wandb", + type=str2bool, + nargs="?", + const=True, + default=False, + help="log to wandb", + ) + parser.add_argument( + "--wandb_entity", + type=str, + default="", + help="Wandb entity name string", + ) + parser.add_argument( + "--wandb_key", + type=str, + default="", + help="Wandb key", + ) + parser.add_argument( + "--wandb_project", + type=str, + default="vidtok", + ) + parser.add_argument( + "--wandb_id", + type=str, + default=None, + help="automatically resume from the same wandb id" + "must be used in combination with --wandb_auto_resume False", + ) + parser.add_argument( + "--wandb_auto_resume", + type=str2bool, + nargs="?", + const=True, + default=True, + help="will find the latest run id in the logdir" + "if checkpoint_auto_resume is False, wandb_auto_resume will be ignored", + ) + parser.add_argument( + "--checkpoint_auto_resume", + type=str2bool, + nargs="?", + const=True, + default=True, + help="will find the latest checkpoint in the logdir" + "if checkpoint_auto_resume is False, wandb_auto_resume will be ignored", + ) + parser.add_argument( + "--no_base_name", + type=str2bool, + nargs="?", + const=True, + default=False, # TODO: later default to True + help="log to wandb", + ) + if version.parse(torch.__version__) >= version.parse("2.0.0"): + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="single checkpoint file to resume from", + ) + default_args = default_trainer_args() + for key in default_args: + # parameters in the pl.Trainer are passed as --key value + parser.add_argument("--" + key, default=default_args[key]) + return parser + + +def get_checkpoint_name(logdir): + ckpt = os.path.join(logdir, "checkpoints", "last**.ckpt") + ckpt = natsorted(glob.glob(ckpt)) + print0('available "last" checkpoints:') + print0(ckpt) + if len(ckpt) > 1: + print0("got most recent checkpoint") + ckpt = sorted(ckpt, key=lambda x: os.path.getmtime(x))[-1] + print0(f"Most recent ckpt is {ckpt}") + with open(os.path.join(logdir, "most_recent_ckpt.txt"), "w") as f: + f.write(ckpt + "\n") + try: + version = int(ckpt.split("/")[-1].split("-v")[-1].split(".")[0]) + except Exception as e: + print0("version confusion but not bad") + print0(e) + version = 1 + # version = last_version + 1 + else: + # in this case, we only have one "last.ckpt" + ckpt = ckpt[0] + version = 1 + melk_ckpt_name = f"last-v{version}.ckpt" + print0(f"Current melk ckpt name: {melk_ckpt_name}") + return ckpt, melk_ckpt_name + + +class SetupCallback(Callback): + def __init__( + self, + resume, + now, + logdir, + ckptdir, + cfgdir, + config, + lightning_config, + debug, + save_ckpt_on_exception=False, + ckpt_name=None, + seed=None, + seed_rank=False, + ): + super().__init__() + self.resume = resume + self.now = now + self.logdir = logdir + self.ckptdir = ckptdir + self.cfgdir = cfgdir + self.config = config + self.lightning_config = lightning_config + self.debug = debug + self.save_ckpt_on_exception = save_ckpt_on_exception + self.ckpt_name = ckpt_name + self.seed = seed + self.seed_rank = seed_rank + + def on_exception(self, trainer: pl.Trainer, pl_module, exception): + if self.save_ckpt_on_exception and (not self.debug) and (trainer.global_rank == 0): + print0(f"[bold red]\[main][SetupCallback][/bold red] Saving checkpoint to {self.ckptdir}") + if self.ckpt_name is None: + ckpt_path = os.path.join(self.ckptdir, "last.ckpt") + else: + ckpt_path = os.path.join(self.ckptdir, self.ckpt_name) + trainer.save_checkpoint(ckpt_path) + + def on_fit_start(self, trainer, pl_module): + if 
self.seed_rank: + # current_seed = torch.initial_seed() + seed_anything(self.seed + trainer.global_rank) + print(f"[bold red]\[main][SetupCallback][/bold red] Rank {trainer.global_rank}: Reset GLOBAL seed to {self.seed + trainer.global_rank}") + elif hasattr(pl_module, "set_seed") and callable(pl_module.set_seed): + pl_module.set_seed(self.seed) + print0(f"[bold red]\[main][SetupCallback][/bold red] Set pl_module seed to {self.seed} with pl_module.set_seed") + if trainer.global_rank == 0: + # Create logdirs and save configs + print0(f"[bold red]\[main][SetupCallback][/bold red] Creating logdir: {self.logdir}, ckptdir: {self.ckptdir}, cfgdir: {self.cfgdir}") + os.makedirs(self.logdir, exist_ok=True) + os.makedirs(self.ckptdir, exist_ok=True) + os.makedirs(self.cfgdir, exist_ok=True) + + if "callbacks" in self.lightning_config: + if ( + "metrics_over_trainsteps_checkpoint" + in self.lightning_config["callbacks"] + ): + os.makedirs( + os.path.join(self.ckptdir, "trainstep_checkpoints"), + exist_ok=True, + ) + print0("[bold red]\[main][SetupCallback][/bold red] Project config") + print0(OmegaConf.to_yaml(self.config)) + if MULTINODE_HACKS and not self.debug: + import time + time.sleep(5) + OmegaConf.save( + self.config, + os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)), + ) + + print0("[bold red]\[main][SetupCallback][/bold red] Lightning config") + print0(OmegaConf.to_yaml(self.lightning_config)) + OmegaConf.save( + OmegaConf.create({"lightning": self.lightning_config}), + os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)), + ) + + else: + # ModelCheckpoint callback created log directory --- remove it + if not MULTINODE_HACKS and not self.resume and os.path.exists(self.logdir): + dst, name = os.path.split(self.logdir) + dst = os.path.join(dst, "child_runs", name) + os.makedirs(os.path.split(dst)[0], exist_ok=True) + try: + os.rename(self.logdir, dst) + except FileNotFoundError: + pass + + +class ImageLogger(Callback): + def __init__( + self, + batch_frequency, + max_samples, + clamp=True, + increase_log_steps=True, + rescale=True, + disabled=True, + log_on_batch_idx=False, + log_first_step=False, + log_images_kwargs=None, + log_before_first_step=False, + enable_autocast=True, + ): + super().__init__() + self.enable_autocast = enable_autocast + self.rescale = rescale + self.batch_freq = batch_frequency + self.max_samples = max_samples + self.log_steps = [2**n for n in range(int(np.log2(self.batch_freq)) + 1)] + if not increase_log_steps: + self.log_steps = [self.batch_freq] + self.clamp = clamp + self.disabled = disabled + self.log_on_batch_idx = log_on_batch_idx + self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {} + self.log_first_step = log_first_step + self.log_before_first_step = log_before_first_step + + @rank_zero_only + def log_local( + self, + save_dir, + split, + images, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "images", split) + for k in images: + if isheatmap(images[k]): + fig, ax = plt.subplots() + ax = ax.matshow( + images[k].cpu().numpy(), cmap="hot", interpolation="lanczos" + ) + plt.colorbar(ax) + plt.axis("off") + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + plt.savefig(path) + plt.close() + # TODO: support wandb + else: + grid = torchvision.utils.make_grid(images[k], nrow=4) + if self.rescale: + grid = 
(grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1) + grid = grid.numpy() + grid = (grid * 255).astype(np.uint8) + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + path = os.path.join(root, filename) + os.makedirs(os.path.split(path)[0], exist_ok=True) + img = Image.fromarray(grid) + img.save(path) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "logger_log_image only supports WandbLogger currently" + pl_module.logger.log_image( + key=f"{split}/{k}", + images=[ + img, + ], + step=pl_module.global_step, + ) + + @rank_zero_only + def log_img(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + self.check_frequency(check_idx) + and hasattr(pl_module, "log_images") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_images) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + gpu_autocast_kwargs = { + "enabled": self.enable_autocast, # torch.is_autocast_enabled(), + "dtype": torch.get_autocast_gpu_dtype(), + "cache_enabled": torch.is_autocast_cache_enabled(), + } + with torch.no_grad(), torch.cuda.amp.autocast(**gpu_autocast_kwargs): + images = pl_module.log_images( + batch, split=split, **self.log_images_kwargs + ) + + for k in images: + N = min(images[k].shape[0], self.max_samples) + if not isheatmap(images[k]): + images[k] = images[k][:N] + if isinstance(images[k], torch.Tensor): + images[k] = images[k].detach().float().cpu() + if self.clamp and not isheatmap(images[k]): + images[k] = torch.clamp(images[k], -1.0, 1.0) + + self.log_local( + pl_module.logger.save_dir, + split, + images, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + def check_frequency(self, check_idx): + if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and ( + check_idx > 0 or self.log_first_step + ): + try: + self.log_steps.pop(0) + except IndexError as e: + print0("[bold red]\[main][ImageLogger][/bold red]", e) + pass + return True + return False + + @rank_zero_only + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): + if not self.disabled and (pl_module.global_step > 0 or self.log_first_step): + self.log_img(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): + if self.log_before_first_step and pl_module.global_step == 0: + print0(f"[bold red]\[main][ImageLogger][/bold red] {self.__class__.__name__}: logging before training") + self.log_img(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_validation_batch_end( + self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs + ): + if not self.disabled and pl_module.global_step > 0: + self.log_img(pl_module, batch, batch_idx, split="val") + if hasattr(pl_module, "calibrate_grad_norm"): + if ( + pl_module.calibrate_grad_norm and batch_idx % 25 == 0 + ) and batch_idx > 0: + self.log_gradients(trainer, pl_module, batch_idx=batch_idx) + + +@rank_zero_only +def init_wandb(save_dir, opt, config, group_name, name_str): + print0(f"[bold red]\[main][init_wandb][/bold red] Creating WANDB_DIR: {save_dir}") + os.makedirs(save_dir, exist_ok=True) + + # os.environ["WANDB_DIR"] 
= save_dir + gitcmd = f'git config --global --add safe.directory {os.path.dirname(os.path.abspath(__file__))}' + os.system(gitcmd) + print0(f"[bold red]\[main][init_wandb][/bold red] wandb_id is set to {opt.wandb_id}") + wandb_id = opt.wandb_id if opt.wandb_id is not None else name_str + + if not wandb.api.api_key: + wandb.login(key=opt.wandb_key) + if opt.debug: + wandb.init(project=opt.wandb_project, mode="offline", group=group_name) + else: + wandb.init( + project=opt.wandb_project, + entity=opt.wandb_entity, + config=dict(config), + group=group_name, + name=name_str, + resume='auto', + id=wandb_id, + ) + + +if __name__ == "__main__": + # custom parser to specify config files, train, test and debug mode, + # postfix, resume. + # `--key value` arguments are interpreted as arguments to the trainer. + # `nested.key=value` arguments are interpreted as config parameters. + # configs are merged from left-to-right followed by command line parameters. + + # model: + # base_learning_rate: float + # target: path to lightning module + # params: + # key: value + # data: + # target: main.DataModuleFromConfig + # params: + # batch_size: int + # wrap: bool + # train: + # target: path to train dataset + # params: + # key: value + # validation: + # target: path to validation dataset + # params: + # key: value + # test: + # target: path to test dataset + # params: + # key: value + # lightning: (optional, has sane defaults and can be specified on cmdline) + # trainer: + # additional arguments to trainer + # logger: + # logger to instantiate + # modelcheckpoint: + # modelcheckpoint to instantiate + # callbacks: + # callback1: + # target: importpath + # params: + # key: value + + now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + + # add cwd for convenience and to make classes in this file available when + # running as `python main.py` + # (in particular `main.DataModuleFromConfig`) + sys.path.append(os.getcwd()) + + parser = get_parser() + + opt, unknown = parser.parse_known_args() + + if opt.name and opt.resume: + raise ValueError( + "-n/--name and -r/--resume cannot be specified both." 
+ "If you want to resume training in a new log folder, " + "use -n/--name in combination with --resume_from_checkpoint" + ) + melk_ckpt_name = None + name = None + if opt.resume: + if not os.path.exists(opt.resume): + raise ValueError("Cannot find {}".format(opt.resume)) + if os.path.isfile(opt.resume): + paths = opt.resume.split("/") + # idx = len(paths)-paths[::-1].index("logs")+1 + # logdir = "/".join(paths[:idx]) + logdir = "/".join(paths[:-2]) + ckpt = opt.resume + _, melk_ckpt_name = get_checkpoint_name(logdir) + else: + assert os.path.isdir(opt.resume), opt.resume + logdir = opt.resume.rstrip("/") + ckpt, melk_ckpt_name = get_checkpoint_name(logdir) + + print0("-" * 80) + print0(f'[bold red][main][/bold red] Resuming from checkpoint "{ckpt}"') + + opt.resume_from_checkpoint = ckpt + base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml"))) + opt.base = base_configs + opt.base + _tmp = logdir.split("/") + nowname = _tmp[-1] + else: + if opt.name: + name = "_" + opt.name + elif opt.base: + if opt.no_base_name: + name = "" + else: + if opt.legacy_naming: + cfg_fname = os.path.split(opt.base[0])[-1] + cfg_name = os.path.splitext(cfg_fname)[0] + else: + assert "configs" in os.path.split(opt.base[0])[0], os.path.split( + opt.base[0] + )[0] + cfg_path = os.path.split(opt.base[0])[0].split(os.sep)[ + os.path.split(opt.base[0])[0].split(os.sep).index("configs") + + 1 : + ] # cut away the first one (we assert all configs are in "configs") + cfg_name = os.path.splitext(os.path.split(opt.base[0])[-1])[0] + cfg_name = "-".join(cfg_path) + f"-{cfg_name}" + name = "_" + cfg_name + else: + name = "" + # automatic resume last checkpoint if available + if os.path.exists(opt.logdir): + auto_resumed = False + for sub_dir in sorted(os.listdir(opt.logdir)): + if sub_dir.endswith(name + opt.postfix): + ## checkpoint resume + if opt.checkpoint_auto_resume and not opt.debug: + checkpoint_dir = os.path.join(opt.logdir, sub_dir, "checkpoints") + # Use the max step checkpoint file + ckpt_files1 = glob.glob(os.path.join(checkpoint_dir, "*/*.ckpt")) + ckpt_files2 = glob.glob(os.path.join(checkpoint_dir, "*.ckpt")) + ckpt_files = ckpt_files1 + ckpt_files2 + ckpt_files.sort(key=get_step_value, reverse=True) + if ckpt_files: + ckpt = ckpt_files[0] + else: + # If no checkpoint files found, use a random initialized model + ckpt = None + if ckpt is not None and os.path.isfile(ckpt): + opt.resume_from_checkpoint = ckpt + auto_resumed = True + # print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Find previous log dir and checkpoint: {ckpt}") + ## wandb resume + if opt.wandb_auto_resume: + wandb_dir = Path(os.path.join(opt.logdir, sub_dir)) / "wandb" + if wandb_dir.exists() and any((wandb_dir / "latest-run").iterdir()): + # Parse unique `run_id` from the `.wandb.` file... + wandb_fns = [f.name for f in (wandb_dir / "latest-run").iterdir() if f.name.endswith(".wandb")] + assert len(wandb_fns) == 1, f"There should only be 1 `.wandb.` file... found {len(wandb_fns)}!" 
+ # Regex Match on `run-{id}.wandb` + opt.wandb_id = re.search("run-(.+?).wandb", wandb_fns[0]).group(1) + # print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Find previous wandb run id: {opt.wandb_id}") + if auto_resumed: + print0(f"[bold red]\[main][/bold red] Auto-resuming from checkpoint: {opt.resume_from_checkpoint} and wandb id: {opt.wandb_id}") + ckpt_basename = os.path.basename(opt.resume_from_checkpoint) + seed_str = ''.join(re.findall(r'\d+', ckpt_basename)) + if len(seed_str) > 0: + opt.seed = int(seed_str) + print0(f"[bold red]\[main][/bold red] Auto-reseting seed to {opt.seed} from checkpoint name") + + if not opt.no_date: + nowname = now + name + opt.postfix + else: + nowname = name + opt.postfix + if nowname.startswith("_"): + nowname = nowname[1:] + logdir = os.path.join(opt.logdir, nowname) + print0(f"[bold red]\[main][/bold red] LOGDIR: {logdir}") + + ckptdir = os.path.join(logdir, "checkpoints") + cfgdir = os.path.join(logdir, "configs") + if not opt.seed_rank: + seed_everything(opt.seed, workers=True) # torch.initial_seed() + + # move before model init, in case a torch.compile(...) is called somewhere + if opt.enable_tf32: + # pt_version = version.parse(torch.__version__) + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + print0(f"[bold red]\[main][/bold red] Enabling TF32 for PyTorch {torch.__version__}") + else: + print0(f"[bold red]\[main][/bold red] Using default TF32 settings for PyTorch {torch.__version__}:") + print0(f"[bold red]\[main][/bold red] torch.backends.cuda.matmul.allow_tf32={torch.backends.cuda.matmul.allow_tf32}") + print0(f"[bold red]\[main][/bold red] torch.backends.cudnn.allow_tf32={torch.backends.cudnn.allow_tf32}") + + try: + # init and save configs + configs = [OmegaConf.load(cfg) for cfg in opt.base] + # deal with the unknown args, e.g., --model.base_learning_rate=1.0e-4 + for i, u in enumerate(unknown): + if u.startswith("--"): + unknown[i] = u[2:] + # merge all configs and cli args + cli = OmegaConf.from_dotlist(unknown) + config = OmegaConf.merge(*configs, cli) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Merged input config: {config}") + lightning_config = config.pop("lightning", OmegaConf.create()) + # merge trainer cli with config + trainer_config = lightning_config.get("trainer", OmegaConf.create()) + + # debug: default to one node + if opt.debug: + trainer_config["num_nodes"] = 1 + + # default profiler + trainer_config["profiler"] = None if not opt.debug else "simple" + + # default to gpu + trainer_config["accelerator"] = "gpu" + # + standard_args = default_trainer_args() + for k in standard_args: + if getattr(opt, k) != standard_args[k]: + trainer_config[k] = getattr(opt, k) + + if not "devices" in trainer_config and trainer_config["accelerator"] != "gpu": + del trainer_config["accelerator"] + cpu = True + else: + gpuinfo = trainer_config["devices"] + print0(f"[bold red]\[main][/bold red] Running on {gpuinfo} GPUs") + cpu = False + trainer_opt = argparse.Namespace(**trainer_config) + lightning_config.trainer = trainer_config + + # model + model = instantiate_from_config(config.model) + + # trainer and callbacks + trainer_kwargs = dict() + + # default logger configs + default_logger_cfgs = { + "wandb": { + "target": "lightning.pytorch.loggers.WandbLogger", + "params": { + "name": nowname, + "save_dir": logdir, + "offline": opt.debug, + "id": nowname, + "project": opt.wandb_project, + "log_model": False, + "entity": opt.wandb_entity, + }, + }, + "csv": { + "target": 
"lightning.pytorch.loggers.CSVLogger", + "params": { + "name": "testtube", # hack for sbord fanatics + "save_dir": logdir, + }, + }, + "tensorboard": { + "target": "lightning.pytorch.loggers.TensorBoardLogger", + "params": { + "save_dir": logdir, + "name": 'tensorboard', + "version": nowname, + } + }, + } + default_logger_cfg = default_logger_cfgs["wandb" if opt.wandb else "tensorboard"] + if opt.wandb: + # change once leaving "swiffer" config directory + try: + group_name = nowname.split(now)[-1].split("-")[1] + except: + group_name = nowname + default_logger_cfg["params"]["group"] = group_name + + wandb_save_dir = os.path.join(os.getcwd(), logdir) + os.environ["WANDB_DIR"] = wandb_save_dir + + init_wandb( + wandb_save_dir, + opt=opt, + group_name=group_name, + config=config, + name_str=nowname, + ) + if "logger" in lightning_config: + logger_cfg = lightning_config.logger + else: + logger_cfg = OmegaConf.create() + logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg) + trainer_kwargs["logger"] = instantiate_from_config(logger_cfg) + + ckpt_resume_path = opt.resume_from_checkpoint + + # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to + # specify which metric is used to determine best models + default_modelckpt_cfg = { + "target": "lightning.pytorch.callbacks.ModelCheckpoint", + "params": { + "dirpath": ckptdir, + "filename": "{epoch:04}-{step:08}", # "epoch={epoch:06}-step={step:07}" + "verbose": True, + "save_last": True, + "auto_insert_metric_name": True, + }, + } + if hasattr(model, "monitor"): + print0(f"[bold red]\[main][/bold red] Monitoring {model.monitor} as checkpoint metric.") + default_modelckpt_cfg["params"]["monitor"] = model.monitor + default_modelckpt_cfg["params"]["save_top_k"] = 3 + + if "modelcheckpoint" in lightning_config: + modelckpt_cfg = lightning_config.modelcheckpoint + else: + modelckpt_cfg = OmegaConf.create() + modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Merged modelckpt-cfg: {modelckpt_cfg}") + + # https://pytorch-lightning.readthedocs.io/en/stable/extensions/strategy.html + # default to ddp if not further specified + default_strategy_config = {"target": "lightning.pytorch.strategies.DDPStrategy"} + + if "strategy" in lightning_config: + strategy_cfg = lightning_config.strategy + else: + strategy_cfg = OmegaConf.create() + default_strategy_config["params"] = { + "find_unused_parameters": False, + # "static_graph": True, + # "ddp_comm_hook": default.fp16_compress_hook # experiment with this, also for DDPSharded + } + strategy_cfg = OmegaConf.merge(default_strategy_config, strategy_cfg) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] strategy config: {strategy_cfg}") + trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg) + if hasattr(trainer_kwargs["strategy"], "_timeout"): + trainer_kwargs["strategy"]._timeout = datetime.timedelta(seconds=5400) # 3600s = 1h + + # add callback which sets up log directory + default_callbacks_cfg = { + "setup_callback": { + "target": "main.SetupCallback", + "params": { + "resume": opt.resume, + "now": now, + "logdir": logdir, + "ckptdir": ckptdir, + "cfgdir": cfgdir, + "config": config, + "lightning_config": lightning_config, + "debug": opt.debug, + "ckpt_name": melk_ckpt_name, + "seed": opt.seed, + "seed_rank": opt.seed_rank + }, + }, + "image_logger": { + "target": "main.ImageLogger", + "params": {"batch_frequency": 1000, "max_samples": 4, "clamp": True}, + }, + "learning_rate_logger": { + 
"target": "lightning.pytorch.callbacks.LearningRateMonitor", + "params": { + "logging_interval": "step", + # "log_momentum": True + }, + }, + } + if version.parse(pl.__version__) >= version.parse("1.4.0"): + default_callbacks_cfg.update({"checkpoint_callback": modelckpt_cfg}) + + if "callbacks" in lightning_config: + callbacks_cfg = lightning_config.callbacks + else: + callbacks_cfg = OmegaConf.create() + + if "metrics_over_trainsteps_checkpoint" in callbacks_cfg: + print0( + "[bold red]\[main][/bold red] Caution: Saving checkpoints every n train steps without deleting. This might require some free space." + ) + default_metrics_over_trainsteps_ckpt_dict = { + "metrics_over_trainsteps_checkpoint": { + "target": "lightning.pytorch.callbacks.ModelCheckpoint", + "params": { + "dirpath": os.path.join(ckptdir, "trainstep_checkpoints"), + "filename": "{epoch:04}-{step:08}", # "{epoch:06}-{step:09}" + "verbose": True, + "save_top_k": -1, + "every_n_train_steps": 10000, + "save_weights_only": True, + }, + } + } + default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict) + + callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg) + if "ignore_keys_callback" in callbacks_cfg and ckpt_resume_path is not None: + callbacks_cfg.ignore_keys_callback.params["ckpt_path"] = ckpt_resume_path + elif "ignore_keys_callback" in callbacks_cfg: + del callbacks_cfg["ignore_keys_callback"] + + trainer_kwargs["callbacks"] = [ + instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg + ] + if not "plugins" in trainer_kwargs: + trainer_kwargs["plugins"] = list() + + # cmd line trainer args (which are in trainer_opt) have always priority over config-trainer-args (which are in trainer_kwargs) + trainer_opt = vars(trainer_opt) + trainer_kwargs = { + key: val for key, val in trainer_kwargs.items() if key not in trainer_opt + } + trainer = Trainer(**trainer_opt, **trainer_kwargs) + + trainer.logdir = logdir + + # data + if ((not opt.train) or opt.debug) and hasattr(config.data.params, "validation"): + config.data.params.train = config.data.params.validation + print0("[bold red]\[main][/bold red] Using validation data as training data for fast loading.") + data = instantiate_from_config(config.data) + # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + # calling these ourselves should not be necessary but it is. 
+ # lightning still takes care of proper multiprocessing though + data.prepare_data() + # data.setup() + try: + for k in data.datasets: + print0( + f"[bold red]\[main][/bold red] {k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}" + ) + except: + print0("[bold red]\[main][/bold red] datasets not yet initialized.") + + # configure learning rate + if "batch_size" in config.data.params: + bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate + else: + bs, base_lr = ( + config.data.params.train.loader.batch_size, + config.model.base_learning_rate, + ) + if not cpu: + # add for different device input type + if isinstance(lightning_config.trainer.devices, int): + ngpu = lightning_config.trainer.devices + elif isinstance(lightning_config.trainer.devices, list): + ngpu = len(lightning_config.trainer.devices) + elif isinstance(lightning_config.trainer.devices, str): + ngpu = len(lightning_config.trainer.devices.strip(",").split(",")) + else: + ngpu = 1 + if "accumulate_grad_batches" in lightning_config.trainer: + accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches + else: + accumulate_grad_batches = 1 + print0(f"[bold red]\[main][/bold red] accumulate_grad_batches = {accumulate_grad_batches}") + lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches + + if opt.scale_lr: + model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr + print0( + "[bold red]\[main][/bold red] Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)".format( + model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr + ) + ) + else: + model.learning_rate = base_lr + print0("[bold red]\[main][/bold red] NOT using learning rate scaling") + print0(f"[bold red]\[main][/bold red] Setting learning rate to {model.learning_rate:.2e}") + + # allow checkpointing via USR1 + def melk(*args, **kwargs): + # run all checkpoint hooks + if trainer.global_rank == 0: + melkdir = os.path.join(logdir, "melk") + os.makedirs(melkdir, exist_ok=True) + print0(f"[bold red]\[main][/bold red] Saving checkpoint to {melkdir}") + if melk_ckpt_name is None: + ckpt_path = os.path.join(melkdir, "last.ckpt") + else: + ckpt_path = os.path.join(melkdir, melk_ckpt_name) + trainer.save_checkpoint(ckpt_path) + + def divein(*args, **kwargs): + if trainer.global_rank == 0: + import pudb + pudb.set_trace() + + import signal + signal.signal(signal.SIGUSR1, melk) + signal.signal(signal.SIGUSR2, divein) + + # run + if opt.train: + try: + trainer.fit(model, data, ckpt_path=ckpt_resume_path) + print0(f"[bold red]\[main][/bold red] Finish training with logdir: {logdir}") + except Exception as e: + print(f"") + print(f"[bold red]\[main][/bold red] Exception: {e}") + print(f"[bold red]\[main][/bold red] Beijing Time {datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai'))}") + if not opt.debug: + melk() + raise + else: + trainer.validate(model, data, ckpt_path=ckpt_resume_path) + exit() + if not opt.no_test and not trainer.interrupted: + trainer.test(model, data) + except RuntimeError as err: + if MULTINODE_HACKS: + import datetime + import os + import socket + import requests + + device = os.environ.get("CUDA_VISIBLE_DEVICES", "?") + hostname = socket.gethostname() + ts = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + resp = requests.get("http://169.254.169.254/latest/meta-data/instance-id") + print( + f"[bold red]\[main][/bold red] ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): 
{type(err).__name__}: {err}", + flush=True, + ) + raise err + except Exception: + if opt.debug and trainer.global_rank == 0: + try: + import pudb as debugger + except ImportError: + import pdb as debugger + # debugger.post_mortem() + raise + finally: + # move newly created debug project to debug_runs + if opt.debug and not opt.resume and trainer.global_rank == 0: + dst, name = os.path.split(logdir) + dst = os.path.join(dst, "debug_runs", name) + os.makedirs(os.path.split(dst)[0], exist_ok=True) + os.rename(logdir, dst) + + if opt.wandb: + wandb.finish() + + # clean up + # dist.barrier() + # torch.cuda.empty_cache() + dist.destroy_process_group() + + if trainer.global_rank == 0 and opt.debug: + print0(f"[bold red]\[main][/bold red] Current logdir: {logdir}") + # print0(f"[bold red]\[main][/bold red] Profiler summary:") + # print(trainer.profiler.summary()) + print0(f"[bold red]\[main][/bold red] Memory summary:") + num_params = sum([p.numel() for p in model.parameters()]) + print0(f"[bold red]\[main][/bold red] Expected bf16 memory usage from params: {num_params * 2 / 1e9:.2f} GB") + print0(f"[bold red]\[main][/bold red] Current memory usage with model on device {torch.cuda.max_memory_allocated() / 1e9:.2f} GB") + # trainer.print(torch.cuda.memory_summary()) diff --git a/Meissonic/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc b/Meissonic/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dc3ee2b98d3f51e021fb953effcd638ef0ca8ec Binary files /dev/null and b/Meissonic/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc differ diff --git a/Meissonic/VidTok/scripts/inference_evaluate.py b/Meissonic/VidTok/scripts/inference_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..d4152046727305af7b32e0a3e5728072b7085ca4 --- /dev/null +++ b/Meissonic/VidTok/scripts/inference_evaluate.py @@ -0,0 +1,198 @@ +import argparse +import os +import sys +sys.path.append(os.getcwd()) + +import warnings +warnings.filterwarnings("ignore") + +import time +from contextlib import nullcontext +from omegaconf import OmegaConf +from torch import autocast +from tqdm import tqdm + +import numpy as np +import torch +from einops import rearrange +from lightning.pytorch import seed_everything + +from vidtok.data.vidtok import VidTokValDataset +from vidtok.modules.lpips import LPIPS +from vidtok.modules.util import (compute_psnr, compute_ssim, + instantiate_from_config, print0) + + +def load_model_from_config(config, ckpt, ignore_keys=[], verbose=False): + config = OmegaConf.load(config) + config.model.params.ckpt_path = ckpt + config.model.params.ignore_keys = ignore_keys + config.model.params.verbose = verbose + model = instantiate_from_config(config.model) + return model + + +class MultiVideoDataset(VidTokValDataset): + def __init__( + self, + data_dir, + meta_path=None, + input_height=256, + input_width=256, + sample_fps=30, + chunk_size=16, + is_causal=True, + read_long_video=False + ): + super().__init__( + data_dir=data_dir, + meta_path=meta_path, + video_params={ + "input_height": input_height, + "input_width": input_width, + "sample_num_frames": chunk_size + 1 if is_causal else chunk_size, + "sample_fps": sample_fps, + }, + pre_load_frames=True, + last_frames_handle="repeat", + read_long_video=read_long_video, + chunk_size=chunk_size, + is_causal=is_causal, + ) + + def __getitem__(self, idx): + frames = super().__getitem__(idx)["jpg"] + return frames + + +def main(): + parser = 
argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--data_dir", + type=str, + default="./", + help="root folder", + ) + parser.add_argument( + "--meta_path", + type=str, + default=None, + help="path to the .csv meta file", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=16, + help="the size of a chunk - we split a long video into several chunks", + ) + parser.add_argument( + "--read_long_video", + action='store_true' + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[scripts.inference_evaluate][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + assert args.chunk_size % model.encoder.time_downsample_factor == 0 + + + if args.read_long_video: + assert hasattr(model, 'use_tiling'), "Tiling inference is needed to conduct long video reconstruction." 
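# [Editor's illustrative note, not part of the original file] Tiled inference processes a
# long video in temporal chunks instead of encoding/decoding the whole sequence at once,
# which is what keeps memory usage bounded. The decoder chunk is the encoder chunk divided
# by the temporal downsampling factor; e.g. with the default --chunk_size of 16 and the
# causal 488 configs (time_downsample_factor: 4):
#   t_chunk_enc = 16
#   t_chunk_dec = 16 // 4 = 4
# which is also why the script asserts earlier that chunk_size is divisible by
# time_downsample_factor.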
+ print(f"Using tiling inference to save memory usage...") + model.enable_tiling() + model.t_chunk_enc = args.chunk_size + model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor + + if args.input_width > 256: + model.enable_tiling() + + dataset = MultiVideoDataset( + data_dir=args.data_dir, + meta_path=args.meta_path, + input_height=args.input_height, + input_width=args.input_width, + sample_fps=args.sample_fps, + chunk_size=args.chunk_size, + is_causal=model.is_causal, + read_long_video=args.read_long_video + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + perceptual_loss = LPIPS().eval() + perceptual_loss = perceptual_loss.to(device) + + psnrs, ssims, lpipss = [], [], [] + + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, output, reg_log = model(input) + output = output.clamp(-1, 1) + input, output = map(lambda x: (x + 1) / 2, (input, output)) + + if input.dim() == 5: + input = rearrange(input, "b c t h w -> (b t) c h w") + assert output.dim() == 5 + output = rearrange(output, "b c t h w -> (b t) c h w") + + for inp, out in zip(torch.split(input, 16), torch.split(output, 16)): + psnrs += [compute_psnr(inp, out).item()] * inp.shape[0] + ssims += [compute_ssim(inp, out).item()] * inp.shape[0] + lpipss += [perceptual_loss(inp * 2 - 1, out * 2 - 1).mean().item()] * inp.shape[0] + + toc = time.time() + print0( + f"[bold red]\[scripts.inference_evaluate][/bold red] PSNR: {np.mean(psnrs):.4f}, SSIM: {np.mean(ssims):.4f}, LPIPS: {np.mean(lpipss):.4f}" + ) + print0(f"[bold red]\[scripts.inference_evaluate][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/scripts/inference_reconstruct.py b/Meissonic/VidTok/scripts/inference_reconstruct.py new file mode 100644 index 0000000000000000000000000000000000000000..3a26b9475339f6675e01fd052637c8465ca37caf --- /dev/null +++ b/Meissonic/VidTok/scripts/inference_reconstruct.py @@ -0,0 +1,246 @@ +import os +import sys +sys.path.append(os.getcwd()) + +import argparse +import warnings +warnings.filterwarnings("ignore") + +import time +from contextlib import nullcontext +from omegaconf import OmegaConf +from pathlib import Path +from tqdm import tqdm + +import numpy as np +import torch +import decord +from einops import rearrange +from lightning.pytorch import seed_everything +from torch import autocast +from torchvision import transforms +from torchvision.io import write_video + +from vidtok.modules.util import print0 +from scripts.inference_evaluate import load_model_from_config + + +class SingleVideoDataset(torch.utils.data.Dataset): + def __init__( + self, + video_path, + input_height=128, + input_width=128, + sample_fps=8, + chunk_size=16, + is_causal=True, + read_long_video=False + ): + decord.bridge.set_bridge("torch") + self.video_path = video_path + self.transform = transforms.Compose( + [ + transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + ] + ) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + if read_long_video: + video_length = len(frame_ids) + if is_causal and video_length > 
chunk_size: + self.frame_ids_batch.append(frame_ids[:chunk_size * ((video_length - 1) // chunk_size) + 1]) + elif not is_causal and video_length >= chunk_size: + self.frame_ids_batch.append(frame_ids[:chunk_size * (video_length // chunk_size)]) + else: + num_frames_per_batch = chunk_size + 1 if is_causal else chunk_size + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x : x + num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x : x + num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255.0 + frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=16, + help="the size of a chunk - we split a long video into several chunks", + ) + parser.add_argument( + "--read_long_video", + action='store_true' + ) + parser.add_argument( + "--pad_gen_frames", + action="store_true", + help="Used only in causal mode. 
If True, pad frames generated in the last batch, else replicate the first frame instead", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[scripts.inference_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + config = OmegaConf.load(args.config) + + os.makedirs(args.output_video_dir, exist_ok=True) + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + assert args.chunk_size % model.encoder.time_downsample_factor == 0 + + if args.read_long_video: + assert hasattr(model, 'use_tiling'), "Tiling inference is needed to conduct long video reconstruction." + print(f"Using tiling inference to save memory usage...") + model.use_tiling = True + model.t_chunk_enc = args.chunk_size + model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor + model.use_overlap = True + + dataset = SingleVideoDataset( + video_path=args.input_video_path, + input_height=args.input_height, + input_width=args.input_width, + sample_fps=args.sample_fps, + chunk_size=args.chunk_size, + is_causal=model.is_causal, + read_long_video=args.read_long_video + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + inputs = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + + if model.is_causal and not args.read_long_video and args.pad_gen_frames: + if i == 0: + _, xrec, _ = model(input) + else: + _, xrec, _ = model(torch.cat([last_gen_frames, input], dim=2)) + xrec = xrec[:, :, -input.shape[2]:].clamp(-1, 1) + last_gen_frames = xrec[:, :, (1 - model.encoder.time_downsample_factor):, :, :] + else: + _, xrec, _ = model(input) + + input = rearrange(input, "b c t h w -> (b t) c h w") + inputs.append(input) + xrec = rearrange(xrec.clamp(-1, 1), "b c t h w -> (b t) c h w") + outputs.append(xrec) + + toc = time.time() + + # save the outputs as videos + inputs = tensor_to_uint8(torch.cat(inputs, dim=0)) + inputs = rearrange(inputs, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs.shape[0], outputs.shape[0]) + final = np.concatenate([inputs[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"{Path(args.input_video_path).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + + print0(f"[bold red]Results saved in: {output_video_path}[/bold red]") + print0(f"[bold red]\[scripts.inference_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/test_vidtok.py b/Meissonic/VidTok/test_vidtok.py new file mode 100644 index 0000000000000000000000000000000000000000..d4c82d5b7d1ff85d8713321354841413318ba6a5 --- /dev/null +++ b/Meissonic/VidTok/test_vidtok.py @@ -0,0 +1,758 @@ +#!/usr/bin/env python3 +""" +Test script for VidTok tokenizer performance. + +This script: +1. Loads a video from the training dataset +2. Encodes it using VidTok tokenizer +3. Decodes it back +4. Computes metrics (PSNR, SSIM, MSE) +5. 
Creates a side-by-side comparison video +6. Saves the results + +Based on VidTok: https://github.com/microsoft/VidTok +""" + +import argparse +import os +import sys +sys.path.append(os.getcwd()) + +import torch +import numpy as np +from PIL import Image +import cv2 +from torchvision import transforms +from torchvision.utils import make_grid, save_image + +# VidTok imports - adjust path if needed +VIDTOK_AVAILABLE = False +VIDTOK_PATH = None + +def _setup_vidtok(): + """Setup VidTok by trying to import or download from GitHub.""" + global VIDTOK_AVAILABLE, VIDTOK_PATH + + # Try to import from existing installation + try: + from scripts.inference_evaluate import load_model_from_config + VIDTOK_AVAILABLE = True + return load_model_from_config + except ImportError: + pass + + # Try to find VidTok in common locations + vidtok_paths = [ + "VidTok", + "../VidTok", + os.path.join(os.path.dirname(__file__), "../VidTok"), + os.path.expanduser("~/VidTok"), + ] + + for vidtok_path in vidtok_paths: + if os.path.exists(vidtok_path) and os.path.exists(os.path.join(vidtok_path, "scripts")): + sys.path.insert(0, vidtok_path) + try: + from scripts.inference_evaluate import load_model_from_config + VIDTOK_AVAILABLE = True + VIDTOK_PATH = vidtok_path + print(f"Found VidTok at: {vidtok_path}") + return load_model_from_config + except ImportError: + if vidtok_path in sys.path: + sys.path.remove(vidtok_path) + continue + + # Try to download from GitHub + print("VidTok not found locally. Attempting to download from GitHub...") + try: + import subprocess + import tempfile + + # Create cache directory + cache_dir = os.path.join(os.getcwd(), "vidtok_cache") + vidtok_dir = os.path.join(cache_dir, "VidTok") + + # Check if already downloaded + if os.path.exists(vidtok_dir) and os.path.exists(os.path.join(vidtok_dir, "scripts")): + sys.path.insert(0, vidtok_dir) + try: + from scripts.inference_evaluate import load_model_from_config + VIDTOK_AVAILABLE = True + VIDTOK_PATH = vidtok_dir + print(f"Using cached VidTok from: {vidtok_dir}") + return load_model_from_config + except ImportError: + if vidtok_dir in sys.path: + sys.path.remove(vidtok_dir) + + # Download from GitHub + print("Downloading VidTok from GitHub...") + os.makedirs(cache_dir, exist_ok=True) + + # Use git clone if available, otherwise download zip + if subprocess.run(["which", "git"], capture_output=True).returncode == 0: + # Clone repository + if os.path.exists(vidtok_dir): + import shutil + shutil.rmtree(vidtok_dir) + + result = subprocess.run( + ["git", "clone", "--depth", "1", "https://github.com/microsoft/VidTok.git", vidtok_dir], + capture_output=True, + text=True + ) + if result.returncode == 0: + sys.path.insert(0, vidtok_dir) + try: + from scripts.inference_evaluate import load_model_from_config + VIDTOK_AVAILABLE = True + VIDTOK_PATH = vidtok_dir + print(f"Successfully downloaded VidTok to: {vidtok_dir}") + return load_model_from_config + except ImportError as e: + if vidtok_dir in sys.path: + sys.path.remove(vidtok_dir) + print(f"Failed to import VidTok after download: {e}") + else: + # Fallback: download zip file + print("Git not available, trying to download zip file...") + import urllib.request + import zipfile + + zip_url = "https://github.com/microsoft/VidTok/archive/refs/heads/main.zip" + zip_path = os.path.join(cache_dir, "VidTok-main.zip") + + print(f"Downloading {zip_url}...") + urllib.request.urlretrieve(zip_url, zip_path) + + # Extract zip + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(cache_dir) + + # Rename 
extracted directory + extracted_dir = os.path.join(cache_dir, "VidTok-main") + if os.path.exists(vidtok_dir): + import shutil + shutil.rmtree(vidtok_dir) + os.rename(extracted_dir, vidtok_dir) + + sys.path.insert(0, vidtok_dir) + try: + from scripts.inference_evaluate import load_model_from_config + VIDTOK_AVAILABLE = True + VIDTOK_PATH = vidtok_dir + print(f"Successfully downloaded VidTok to: {vidtok_dir}") + return load_model_from_config + except ImportError as e: + if vidtok_dir in sys.path: + sys.path.remove(vidtok_dir) + print(f"Failed to import VidTok after download: {e}") + + except Exception as e: + print(f"Failed to download VidTok: {e}") + + return None + +# Setup VidTok +load_model_from_config = _setup_vidtok() +sys.path.append("/mnt/Meissonic") + + +from train.dataset_utils import OpenVid1MDataset +from transformers import T5Tokenizer + + + + +def calculate_psnr(img1, img2, max_val=1.0): + """Calculate PSNR between two images.""" + # Ensure both tensors are on the same device (preferably CPU for metric calculation) + if img1.device != img2.device: + img1 = img1.to(img2.device) + # Move to CPU for metric calculation to avoid GPU memory issues + img1 = img1.cpu() + img2 = img2.cpu() + + mse = torch.mean((img1 - img2) ** 2) + if mse == 0: + return float('inf') + psnr = 20 * torch.log10(max_val / torch.sqrt(mse)) + return psnr.item() + + +def calculate_mse(img1, img2): + """Calculate MSE between two images.""" + # Ensure both tensors are on the same device (preferably CPU for metric calculation) + if img1.device != img2.device: + img1 = img1.to(img2.device) + # Move to CPU for metric calculation to avoid GPU memory issues + img1 = img1.cpu() + img2 = img2.cpu() + + return torch.mean((img1 - img2) ** 2).item() + + +def calculate_ssim(img1, img2, window_size=11): + """Calculate SSIM between two images (simplified version).""" + # Ensure both tensors are on the same device (preferably CPU for metric calculation) + if img1.device != img2.device: + img1 = img1.to(img2.device) + # Move to CPU for metric calculation to avoid GPU memory issues + img1 = img1.cpu() + img2 = img2.cpu() + + # Simple SSIM approximation + C1 = 0.01 ** 2 + C2 = 0.03 ** 2 + + mu1 = img1.mean() + mu2 = img2.mean() + + sigma1_sq = img1.var() + sigma2_sq = img2.var() + sigma12 = ((img1 - mu1) * (img2 - mu2)).mean() + + ssim = ((2 * mu1 * mu2 + C1) * (2 * sigma12 + C2)) / ((mu1**2 + mu2**2 + C1) * (sigma1_sq + sigma2_sq + C2)) + return ssim.item() + + +def video_to_numpy(video_tensor): + """ + Convert video tensor [C, F, H, W] in [-1, 1] or [0, 1] to numpy array [F, H, W, C] in [0, 255] (RGB). + VidTok uses [-1, 1] range. + """ + if isinstance(video_tensor, torch.Tensor): + # [C, F, H, W] -> [F, C, H, W] -> [F, H, W, C] + video_np = video_tensor.permute(1, 0, 2, 3).cpu().numpy() # [F, C, H, W] + video_np = np.transpose(video_np, (0, 2, 3, 1)) # [F, H, W, C] + + # Normalize to [0, 1] range (VidTok uses [-1, 1]) + if video_np.min() < 0: + # Assume range is [-1, 1] + video_np = (video_np + 1) / 2 + else: + # Assume range is [0, 1] + video_np = np.clip(video_np, 0, 1) + + # Convert to [0, 255] + video_np = (video_np * 255).astype(np.uint8) + else: + video_np = np.array(video_tensor) + return video_np + + +def add_text_to_image(image_tensor, text, position=(10, 30)): + """ + Add text label to an image tensor. 
+ + Args: + image_tensor: Image tensor [C, H, W] in [0, 1] or [-1, 1] + text: Text to add + position: (x, y) position for text + Returns: + Image tensor with text [C, H, W] + """ + # Normalize to [0, 1] if needed + img_norm = image_tensor.clone() + if img_norm.min() < 0: + img_norm = (img_norm + 1) / 2 + img_norm = torch.clamp(img_norm, 0, 1) + + # Convert to PIL Image + image_np = img_norm.permute(1, 2, 0).cpu().numpy() # [H, W, C] + image_np = (image_np * 255).astype(np.uint8) + pil_image = Image.fromarray(image_np) + + # Add text + from PIL import ImageDraw, ImageFont + draw = ImageDraw.Draw(pil_image) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 24) + except: + try: + font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) + except: + font = ImageFont.load_default() + + # Draw white text with black outline + x, y = position + # Draw outline + for adj in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]: + draw.text((x + adj[0], y + adj[1]), text, font=font, fill=(0, 0, 0)) + # Draw main text + draw.text((x, y), text, font=font, fill=(255, 255, 255)) + + # Convert back to tensor (normalize to [0, 1]) + image_tensor = transforms.ToTensor()(pil_image) + return image_tensor + + +def create_side_by_side_video(original, reconstructed, output_path, fps=8): + """ + Create a side-by-side comparison video. + + Args: + original: Original video tensor [C, F, H, W] in [0, 1] or [-1, 1] + reconstructed: Reconstructed video tensor [C, F, H, W] in [0, 1] or [-1, 1] + output_path: Path to save the output video + fps: Frames per second + """ + # Convert to numpy (RGB format: [F, H, W, C]) + orig_np = video_to_numpy(original) + recon_np = video_to_numpy(reconstructed) + + # Get dimensions + F, H, W, C = orig_np.shape + F_recon, H_recon, W_recon, C_recon = recon_np.shape + + # Ensure same number of frames + F_min = min(F, F_recon) + orig_np = orig_np[:F_min] + recon_np = recon_np[:F_min] + + # Resize if dimensions don't match + if H != H_recon or W != W_recon: + print(f"Resizing reconstructed video from ({H_recon}, {W_recon}) to ({H}, {W})") + recon_np_resized = np.zeros((F_min, H, W, C), dtype=np.uint8) + for f in range(F_min): + recon_np_resized[f] = cv2.resize(recon_np[f], (W, H), interpolation=cv2.INTER_LINEAR) + recon_np = recon_np_resized + + # Add text labels to frames + from PIL import Image, ImageDraw, ImageFont + side_by_side_frames = [] + for f in range(F_min): + # Original frame with label + orig_frame_pil = Image.fromarray(orig_np[f]) + draw = ImageDraw.Draw(orig_frame_pil) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 32) + except: + try: + font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 32) + except: + font = ImageFont.load_default() + # Draw text with outline for visibility + text = "Original" + x, y = 20, 20 + for adj in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]: + draw.text((x + adj[0], y + adj[1]), text, font=font, fill=(0, 0, 0)) + draw.text((x, y), text, font=font, fill=(255, 255, 255)) + orig_frame = np.array(orig_frame_pil) + + # Reconstructed frame with label + recon_frame_pil = Image.fromarray(recon_np[f]) + draw = ImageDraw.Draw(recon_frame_pil) + text = "Reconstructed" + x, y = 20, 20 + for adj in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]: + draw.text((x + adj[0], y + adj[1]), text, font=font, fill=(0, 0, 0)) + draw.text((x, y), text, font=font, fill=(255, 255, 0)) # Yellow text + 
recon_frame = np.array(recon_frame_pil) + + # Concatenate horizontally + frame = np.concatenate([orig_frame, recon_frame], axis=1) + side_by_side_frames.append(frame) + + # Write video using OpenCV (needs BGR format) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(output_path, fourcc, fps, (W * 2, H)) + + if not out.isOpened(): + print(f"Warning: Could not open video writer with mp4v codec, trying XVID...") + fourcc = cv2.VideoWriter_fourcc(*'XVID') + out = cv2.VideoWriter(output_path, fourcc, fps, (W * 2, H)) + + for frame in side_by_side_frames: + # Convert RGB to BGR for OpenCV + frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + out.write(frame_bgr) + + out.release() + print(f"Saved side-by-side video to: {output_path}") + + +def create_comparison_grid(original, reconstructed, output_path, nrow=4): + """ + Create a grid image comparing original and reconstructed frames. + + Args: + original: Original video tensor [C, F, H, W] in [0, 1] or [-1, 1] + reconstructed: Reconstructed video tensor [C, F, H, W] in [0, 1] or [-1, 1] + output_path: Path to save the grid image + nrow: Number of frames per row + """ + # Get number of frames + F = min(original.shape[1], reconstructed.shape[1]) + + # Select frames to display + num_frames_to_show = min(8, F) + frame_indices = np.linspace(0, F - 1, num_frames_to_show, dtype=int) + + frames_list = [] + for idx in frame_indices: + # Original frame with label + orig_frame = original[:, idx, :, :].clone() # [C, H, W] + orig_frame = add_text_to_image(orig_frame, "Original", position=(10, 10)) + frames_list.append(orig_frame) + + # Reconstructed frame with label + recon_frame = reconstructed[:, idx, :, :].clone() # [C, H, W] + recon_frame = add_text_to_image(recon_frame, "Reconstructed", position=(10, 10)) + frames_list.append(recon_frame) + + # Create grid + frames_tensor = torch.stack(frames_list, dim=0) + grid = make_grid(frames_tensor, nrow=nrow * 2, padding=2, pad_value=1.0) + + save_image(grid, output_path) + print(f"Saved comparison grid to: {output_path}") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Test VidTok tokenizer performance") + + parser.add_argument( + "--config", + type=str, + default=None, + help="Path to VidTok config file (e.g., configs/vidtok_kl_causal_488_4chn.yaml). " + "If not provided, will try to download from HuggingFace." + ) + parser.add_argument( + "--ckpt", + type=str, + default=None, + help="Path to VidTok checkpoint file or HuggingFace model ID. " + "If HuggingFace model ID (e.g., microsoft/VidTok), will download automatically." + ) + parser.add_argument( + "--model_name", + type=str, + default="vidtok_kl_causal_488_4chn", + help="VidTok model name for HuggingFace download. " + "Options: vidtok_kl_causal_488_4chn, vidtok_kl_noncausal_488_4chn, " + "vidtok_fsq_causal_488_4096, etc. 
(default: vidtok_kl_causal_488_4chn)" + ) + parser.add_argument( + "--csv_path", + type=str, + required=True, + help="Path to OpenVid1M CSV file" + ) + parser.add_argument( + "--video_root_dir", + type=str, + default=None, + help="Root directory for videos (auto-detected if not provided)" + ) + parser.add_argument( + "--video_index", + type=int, + default=0, + help="Index of video to test (default: 0)" + ) + parser.add_argument( + "--num_frames", + type=int, + default=16, + help="Number of frames (use 17 for causal models, 16 for non-causal)" + ) + parser.add_argument( + "--height", + type=int, + default=256, + help="Video height" + ) + parser.add_argument( + "--width", + type=int, + default=256, + help="Video width" + ) + parser.add_argument( + "--output_dir", + type=str, + default="./vidtok_test_output", + help="Output directory for results" + ) + parser.add_argument( + "--device", + type=str, + default="cuda" if torch.cuda.is_available() else "cpu", + help="Device to use" + ) + parser.add_argument( + "--use_continuous", + action="store_true", + help="Use continuous latent space for decoding (default: use discrete tokens)" + ) + + return parser.parse_args() + + +def main(): + args = parse_args() + + # Create output directory + os.makedirs(args.output_dir, exist_ok=True) + + # Set device + device = torch.device(args.device) + print(f"Using device: {device}") + + # Determine config and checkpoint paths + config_path = args.config + ckpt_path = args.ckpt + + # If checkpoint is a HuggingFace model ID, download from HuggingFace + if ckpt_path is None or ckpt_path.startswith("microsoft/VidTok") or "/" in ckpt_path and not os.path.exists(ckpt_path): + print(f"Downloading VidTok model from HuggingFace...") + try: + from huggingface_hub import hf_hub_download, snapshot_download + import tempfile + + # Determine model ID + if ckpt_path and ckpt_path.startswith("microsoft/VidTok"): + repo_id = ckpt_path + else: + repo_id = "microsoft/VidTok" + + # Download checkpoint + checkpoint_filename = "vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.ckpt"#f"{args.model_name}.ckpt" + print(f"Downloading checkpoint: {checkpoint_filename}") + + # Create temporary directory for downloads + cache_dir = os.path.join(os.getcwd(), "vidtok_cache") + os.makedirs(cache_dir, exist_ok=True) + + # Download checkpoint + ckpt_path = hf_hub_download( + repo_id=repo_id, + filename=f"checkpoints/{checkpoint_filename}", + cache_dir=cache_dir, + local_dir=os.path.join(cache_dir, repo_id.replace("/", "_")), + ) + print(f"Downloaded checkpoint to: {ckpt_path}") + + # Download config if not provided + config_path = "/mnt/Meissonic/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml" #local path to config, no need to download + + except ImportError: + print("Error: huggingface_hub not installed. Install with: pip install huggingface_hub") + sys.exit(1) + except Exception as e: + print(f"Error downloading from HuggingFace: {e}") + print("Please provide --config and --ckpt paths, or install VidTok repository.") + sys.exit(1) + + # Load VidTok model + if not VIDTOK_AVAILABLE or load_model_from_config is None: + print("Error: VidTok scripts not available. 
Please install VidTok:") + print(" git clone https://github.com/microsoft/VidTok") + print(" export PYTHONPATH=\"${PYTHONPATH}:$(pwd)/VidTok\"") + print("\nOr ensure you have git installed for automatic download.") + sys.exit(1) + + print(f"Loading VidTok model from config: {config_path}") + print(f"Checkpoint: {ckpt_path}") + if VIDTOK_PATH: + print(f"Using VidTok from: {VIDTOK_PATH}") + model = load_model_from_config(config_path, ckpt_path) + model = model.to(device).eval() + # model.encoder = torch.compile(model.encoder) + # model.decoder = torch.compile(model.decoder) + + + # Check if model is causal + is_causal = getattr(model, 'is_causal', False) + print(f"Model is causal: {is_causal}") + if is_causal and args.num_frames == 16: + print("Warning: Causal models typically use 17 frames. Consider using --num_frames 17") + + # Load dataset + print(f"Loading dataset from: {args.csv_path}") + + # Auto-detect video_root_dir if not provided + video_root_dir = args.video_root_dir + if video_root_dir is None: + csv_dir = os.path.dirname(args.csv_path) + if os.path.exists(os.path.join(csv_dir, 'video_reorg')): + video_root_dir = os.path.join(csv_dir, 'video_reorg') + elif os.path.exists(os.path.join(os.path.dirname(csv_dir), 'video_reorg')): + video_root_dir = os.path.join(os.path.dirname(csv_dir), 'video_reorg') + else: + video_root_dir = csv_dir + print(f"Warning: Video directory not found, using CSV directory: {video_root_dir}") + + # Initialize tokenizer for dataset (needed for OpenVid1MDataset) + tokenizer = T5Tokenizer.from_pretrained("google/umt5-base") + + # Create dataset + dataset = OpenVid1MDataset( + csv_path=args.csv_path, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=args.num_frames, + height=args.height, + width=args.width, + text_encoder_architecture="umt5-base", + ) + + print(f"Dataset size: {len(dataset)}") + + # Load video + if args.video_index >= len(dataset): + print(f"Error: video_index {args.video_index} >= dataset size {len(dataset)}") + return + + print(f"Loading video at index {args.video_index}...") + sample = dataset[args.video_index] + original_video = sample["video"] # [C, F, H, W] in [0, 1] + + # Get video info from dataset + row = dataset.data[args.video_index] + video_path = row.get('video', 'unknown') + caption = row.get('caption', 'no caption') + + print(f"Video path: {video_path}") + print(f"Caption: {caption}") + print(f"Original video shape: {original_video.shape}") + print(f"Original video range: [{original_video.min():.3f}, {original_video.max():.3f}]") + + # Convert to VidTok format: [B, C, T, H, W] in [-1, 1] + # Original video is [C, F, H, W] in [0, 1] + original_video_vidtok = original_video.unsqueeze(0) # [1, C, F, H, W] + original_video_vidtok = original_video_vidtok.permute(0, 1, 2, 3, 4) # [1, C, F, H, W] -> [1, C, T, H, W] + # Convert from [0, 1] to [-1, 1] + original_video_vidtok = original_video_vidtok * 2.0 - 1.0 + original_video_vidtok = original_video_vidtok.to(device=device) + + print(f"VidTok input shape: {original_video_vidtok.shape}") + print(f"VidTok input range: [{original_video_vidtok.min():.3f}, {original_video_vidtok.max():.3f}]") + + # Encode + print("\nEncoding video...") + with torch.no_grad(), torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu', dtype=torch.float16 if device.type == 'cuda' else torch.float32): + if args.use_continuous: + # Encode to continuous latent space + z, reg_log = model.encode(original_video_vidtok, return_reg_log=True) + print(f"Continuous latent shape: {z.shape}") 
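+            # Rough shape sanity check (illustrative only, assuming a causal "488" config,
+            # i.e. 4x temporal and 8x8 spatial downsampling): an input of [1, 3, 17, 256, 256]
+            # should encode to roughly [1, z_channels, 5, 32, 32], where z_channels is 4 or 16
+            # depending on the checkpoint. reg_log only carries 'indices' for the discrete
+            # (FSQ) checkpoints, so the line below prints 'N/A' for the KL (continuous) models.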
+ print(f"Discrete tokens shape: {reg_log['indices'].shape if 'indices' in reg_log else 'N/A'}") + else: + # Full forward pass to get reconstruction + _, reconstructed_video_vidtok, _ = model(original_video_vidtok) + + # Decode + if args.use_continuous: + print("\nDecoding from continuous latent space...") + with torch.no_grad(), torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu', dtype=torch.float16 if device.type == 'cuda' else torch.float32): + reconstructed_video_vidtok = model.decode(z) + else: + print("\nUsing reconstruction from forward pass...") + + # Convert back to [C, F, H, W] format and [0, 1] range + reconstructed_video_vidtok = reconstructed_video_vidtok.squeeze(0) # [C, T, H, W] + reconstructed_video_vidtok = reconstructed_video_vidtok.permute(0, 1, 2, 3) # [C, T, H, W] -> [C, F, H, W] + # Convert from [-1, 1] to [0, 1] + reconstructed_video = (reconstructed_video_vidtok + 1.0) / 2.0 + reconstructed_video = torch.clamp(reconstructed_video, 0, 1) + + print(f"Reconstructed video shape: {reconstructed_video.shape}") + print(f"Reconstructed video range: [{reconstructed_video.min():.3f}, {reconstructed_video.max():.3f}]") + + # Ensure same number of frames for comparison + F_orig = original_video.shape[1] + F_recon = reconstructed_video.shape[1] + F_min = min(F_orig, F_recon) + + original_video = original_video[:, :F_min, :, :] + reconstructed_video = reconstructed_video[:, :F_min, :, :] + + # Resize if spatial dimensions don't match + if original_video.shape[2:] != reconstructed_video.shape[2:]: + print(f"Resizing reconstructed video from {reconstructed_video.shape[2:]} to {original_video.shape[2:]}") + # Use interpolation to resize + reconstructed_video_resized = torch.zeros_like(original_video) + for f in range(F_min): + frame = reconstructed_video[:, f, :, :].unsqueeze(0) # [1, C, H, W] + frame_resized = torch.nn.functional.interpolate( + frame, size=original_video.shape[2:], mode='bilinear', align_corners=False + ) + reconstructed_video_resized[:, f, :, :] = frame_resized.squeeze(0) + reconstructed_video = reconstructed_video_resized + + # Calculate metrics + print("\nCalculating metrics...") + + # Convert to float32 for metric calculation + orig_f32 = original_video.to(torch.float32).cpu() + recon_f32 = reconstructed_video.to(torch.float32).cpu() + + # Frame-wise metrics + psnr_values = [] + mse_values = [] + ssim_values = [] + + for f in range(F_min): + orig_frame = orig_f32[:, f, :, :] # [C, H, W] + recon_frame = recon_f32[:, f, :, :] # [C, H, W] + + psnr = calculate_psnr(orig_frame, recon_frame) + mse = calculate_mse(orig_frame, recon_frame) + ssim = calculate_ssim(orig_frame, recon_frame) + + psnr_values.append(psnr) + mse_values.append(mse) + ssim_values.append(ssim) + + # Overall metrics + avg_psnr = np.mean(psnr_values) + avg_mse = np.mean(mse_values) + avg_ssim = np.mean(ssim_values) + + print(f"\n=== Metrics ===") + print(f"PSNR: {avg_psnr:.2f} dB (per frame: {psnr_values})") + print(f"MSE: {avg_mse:.6f} (per frame: {mse_values})") + print(f"SSIM: {avg_ssim:.4f} (per frame: {ssim_values})") + + # Save metrics to file + metrics_file = os.path.join(args.output_dir, f"metrics_video_{args.video_index}.txt") + with open(metrics_file, 'w') as f: + f.write(f"Video Index: {args.video_index}\n") + f.write(f"Video Path: {video_path}\n") + f.write(f"Caption: {caption}\n") + f.write(f"Model Config: {args.config}\n") + f.write(f"Model Checkpoint: {args.ckpt}\n") + f.write(f"Use Continuous Latent: {args.use_continuous}\n") + f.write(f"\n=== Metrics ===\n") + 
f.write(f"Average PSNR: {avg_psnr:.2f} dB\n") + f.write(f"Average MSE: {avg_mse:.6f}\n") + f.write(f"Average SSIM: {avg_ssim:.4f}\n") + f.write(f"\nPer-frame PSNR: {psnr_values}\n") + f.write(f"Per-frame MSE: {mse_values}\n") + f.write(f"Per-frame SSIM: {ssim_values}\n") + + print(f"Saved metrics to: {metrics_file}") + + # Create side-by-side video + print("\nCreating side-by-side comparison video...") + video_output_path = os.path.join(args.output_dir, f"comparison_video_{args.video_index}.mp4") + create_side_by_side_video(original_video, reconstructed_video, video_output_path, fps=8) + + # Create comparison grid + print("Creating comparison grid...") + grid_output_path = os.path.join(args.output_dir, f"comparison_grid_video_{args.video_index}.png") + create_comparison_grid(original_video, reconstructed_video, grid_output_path, nrow=4) + + print(f"\n=== Test Complete ===") + print(f"Results saved to: {args.output_dir}") + print(f" - Metrics: {metrics_file}") + print(f" - Side-by-side video: {video_output_path}") + print(f" - Comparison grid: {grid_output_path}") + + +if __name__ == "__main__": + main() + diff --git a/Meissonic/VidTok/test_vidtok.sh b/Meissonic/VidTok/test_vidtok.sh new file mode 100644 index 0000000000000000000000000000000000000000..1f05c470bb98dd86e3bc24e83ab65183e4ac2edd --- /dev/null +++ b/Meissonic/VidTok/test_vidtok.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Test script for VidTok tokenizer performance +# VidTok models will be automatically downloaded from HuggingFace + +# Option 1: Use HuggingFace model ID (recommended - simplest) +python test_vidtok.py \ + --ckpt "microsoft/VidTok" \ + --model_name "vidtok_kl_causal_488_16chn_v1_1" \ + --csv_path "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ + --video_index 3 \ + --num_frames 17 \ + --height 240 \ + --width 424 \ + --output_dir "./vidtok_test_output" \ + --device cuda + +# Option 2: Use local paths (if you have VidTok repository cloned) +# First, clone VidTok: git clone https://github.com/microsoft/VidTok +# Then add to PYTHONPATH: +# export PYTHONPATH="${PYTHONPATH}:$(pwd)/VidTok" +# +# python train/test_vidtok.py \ +# --config "VidTok/configs/vidtok_kl_causal_488_4chn.yaml" \ +# --ckpt "VidTok/checkpoints/vidtok_kl_causal_488_4chn.ckpt" \ +# --csv_path "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ +# --video_index 0 \ +# --num_frames 17 \ +# --height 256 \ +# --width 256 \ +# --output_dir "./vidtok_test_output" \ +# --device cuda + +# Available model names: +# - vidtok_kl_causal_488_4chn (KL divergence, causal, 4 channels) +# - vidtok_kl_noncausal_488_4chn (KL divergence, non-causal, 4 channels) +# - vidtok_fsq_causal_488_4096 (FSQ, causal, codebook size 4096) +# - vidtok_fsq_causal_488_262144 (FSQ, causal, codebook size 262144) +# - vidtok_kl_causal_41616_4chn (KL divergence, causal, larger model) +# See https://huggingface.co/microsoft/VidTok/tree/main/checkpoints for all models + diff --git a/Meissonic/VidTok/vidtok/data/__pycache__/video_read.cpython-310.pyc b/Meissonic/VidTok/vidtok/data/__pycache__/video_read.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..765ac789bf7eb005946e803ad7e074cb463e0e64 Binary files /dev/null and b/Meissonic/VidTok/vidtok/data/__pycache__/video_read.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc b/Meissonic/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..8d55b5c7b63ae35cd09bf4f00a149d7e28fecb69 Binary files /dev/null and b/Meissonic/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/data/datamodule.py b/Meissonic/VidTok/vidtok/data/datamodule.py new file mode 100644 index 0000000000000000000000000000000000000000..c405b84703735b9a076b9ffb4e74de673470c15b --- /dev/null +++ b/Meissonic/VidTok/vidtok/data/datamodule.py @@ -0,0 +1,150 @@ +import numpy as np +from functools import partial + +import torch +import lightning.pytorch as pl +from torch.utils.data import DataLoader, Dataset, IterableDataset + +from vidtok.modules.util import instantiate_from_config + + +class WrappedDataset(Dataset): + """Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset""" + + def __init__(self, dataset): + self.data = dataset + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + +def worker_init_fn(_): + worker_info = torch.utils.data.get_worker_info() + + dataset = worker_info.dataset + worker_id = worker_info.id + + if isinstance(dataset, IterableDataset): + split_size = dataset.num_records // worker_info.num_workers + # reset num_records to the true number to retain reliable length information + dataset.sample_ids = dataset.valid_ids[ + worker_id * split_size : (worker_id + 1) * split_size + ] + current_id = np.random.choice(len(np.random.get_state()[1]), 1) + return np.random.seed(np.random.get_state()[1][current_id] + worker_id) + else: + return np.random.seed(np.random.get_state()[1][0] + worker_id) + + +class DataModuleFromConfig(pl.LightningDataModule): + def __init__( + self, + batch_size, + train=None, + validation=None, + test=None, + predict=None, + wrap=False, + num_workers=None, + pin_train_memory=True, + is_iterable_dataset=False, + shuffle_test_loader=False, + use_worker_init_fn=False, + shuffle_val_dataloader=False, + ): + super().__init__() + self.batch_size = batch_size + self.dataset_configs = dict() + self.num_workers = num_workers if num_workers is not None else batch_size * 2 + self.pin_train_memory = pin_train_memory + self.is_iterable_dataset = is_iterable_dataset + self.use_worker_init_fn = use_worker_init_fn + if train is not None: + self.dataset_configs["train"] = train + self.train_dataloader = self._train_dataloader + if validation is not None: + self.dataset_configs["validation"] = validation + self.val_dataloader = partial( + self._val_dataloader, shuffle=shuffle_val_dataloader + ) + if test is not None: + self.dataset_configs["test"] = test + self.test_dataloader = partial( + self._test_dataloader, shuffle=shuffle_test_loader + ) + if predict is not None: + self.dataset_configs["predict"] = predict + self.predict_dataloader = self._predict_dataloader + self.wrap = wrap + + def prepare_data(self): + for data_cfg in self.dataset_configs.values(): + instantiate_from_config(data_cfg) + + def setup(self, stage=None): + self.datasets = dict( + (k, instantiate_from_config(self.dataset_configs[k])) + for k in self.dataset_configs + ) + if self.wrap: + for k in self.datasets: + self.datasets[k] = WrappedDataset(self.datasets[k]) + + def _train_dataloader(self): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + self.datasets["train"], + batch_size=self.batch_size, + num_workers=self.num_workers, + pin_memory=self.pin_train_memory, + shuffle=False if self.is_iterable_dataset else True, + 
worker_init_fn=init_fn, + ) + + def _val_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + self.datasets["validation"], + batch_size=self.batch_size, + num_workers=self.num_workers, + worker_init_fn=init_fn, + shuffle=shuffle, + ) + + def _test_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + + # do not shuffle dataloader for iterable dataset + shuffle = shuffle and (not self.is_iterable_dataset) + + return DataLoader( + self.datasets["test"], + batch_size=self.batch_size, + num_workers=self.num_workers, + worker_init_fn=init_fn, + shuffle=shuffle, + ) + + def _predict_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + self.datasets["predict"], + batch_size=self.batch_size, + num_workers=self.num_workers, + worker_init_fn=init_fn, + ) diff --git a/Meissonic/VidTok/vidtok/data/video_read.py b/Meissonic/VidTok/vidtok/data/video_read.py new file mode 100644 index 0000000000000000000000000000000000000000..357cd48305141ee70924fd6a79adbe5cb7f6ca5b --- /dev/null +++ b/Meissonic/VidTok/vidtok/data/video_read.py @@ -0,0 +1,88 @@ +import os +import random +import decord +import numpy as np +import torch + +from vidtok.modules.util import print0 + +decord.bridge.set_bridge("torch") + + +def sample_frames_with_fps( + total_frames, + video_fps, + sample_num_frames, + sample_fps, + start_index=None +): + """sample frames proportional to the length of the frames in one second + e.g., 1s video has 30 frames, when 'fps'=3, we sample frames with spacing of 30/3=10 + return the frame indices + + Parameters + ---------- + total_frames : length of the video + video_fps : original fps of the video + sample_num_frames : number of frames to sample + sample_fps : the fps to sample frames + start_index : the starting frame index. If it is not None, it will be used as the starting frame index + + Returns + ------- + frame indices + """ + sample_num_frames = min(sample_num_frames, total_frames) + interval = round(video_fps / sample_fps) + frames_range = (sample_num_frames - 1) * interval + 1 + + if start_index is not None: + start = start_index + elif total_frames - frames_range - 1 < 0: + start = 0 + else: + start = random.randint(0, total_frames - frames_range - 1) + + frame_idxs = np.linspace( + start=start, stop=min(total_frames - 1, start + frames_range), num=sample_num_frames + ).astype(int) + + return frame_idxs + + +def read_frames_with_decord( + video_path, + sample_num_frames, + sample_fps, + start_index=None +) -> tuple[torch.Tensor, list[int]]: + """read frames from video path using decord + + Parameters + ---------- + video_path : path to video + sample_num_frames : number of frames to sample + sample_fps : the fps to sample frames + start_index : the starting frame index. If it is not None, it will be used as the starting frame index + + Returns + ------- + frames (tensor 0~1), frame indices + """ + video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(video_reader) + video_fps = video_reader.get_avg_fps() # note that the fps here is float. 
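+    # Worked example of the sampling arithmetic in sample_frames_with_fps above
+    # (illustrative numbers only): with video_fps ~= 30, sample_fps = 10 and
+    # sample_num_frames = 17, interval = round(30 / 10) = 3 and
+    # frames_range = (17 - 1) * 3 + 1 = 49, so the 17 indices are spread over a
+    # ~49-frame window starting at a random (or the given) start_index.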
+ frame_idxs = sample_frames_with_fps( + total_frames=total_frames, + video_fps=video_fps, + sample_num_frames=sample_num_frames, + sample_fps=sample_fps, + start_index=start_index + ) + frames = video_reader.get_batch(frame_idxs) + frames = frames.float() / 255 + frames = frames.permute(0, 3, 1, 2) + if (frames.shape[0] != sample_num_frames) or (len(frame_idxs) != sample_num_frames): + print0(f"[bold yellow]\[vidtok.data.video_read][read_frames_with_decord][/bold yellow] Warning: need {sample_num_frames} frames, " + f"but got {frames.shape[0]} frames, {len(frame_idxs)} frame indices, video_path={video_path}.") + return frames, frame_idxs diff --git a/Meissonic/VidTok/vidtok/data/vidtok.py b/Meissonic/VidTok/vidtok/data/vidtok.py new file mode 100644 index 0000000000000000000000000000000000000000..4b1898d4718de1450da96f25e9134dd4e1084cc1 --- /dev/null +++ b/Meissonic/VidTok/vidtok/data/vidtok.py @@ -0,0 +1,333 @@ +import os +import glob +from typing import Union + +import decord +import numpy as np +import pandas as pd +import torch +from PIL import Image +from torch.utils.data import Dataset +from torchvision.transforms import v2 +from tqdm import trange + +from vidtok.modules.util import print0 +from .video_read import read_frames_with_decord + + +class VidTokDataset(Dataset): + def __init__( + self, + data_dir: str, + meta_path: str, + video_params: dict, + data_frac: float = 1.0, + is_strict_loading: bool = False, + skip_missing_files: bool = True, + start_index: Union[None, int] = None + ): + super().__init__() + + self.data_dir = data_dir + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Use data dir: {self.data_dir}") + + self.meta_path = meta_path + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Use meta path: {self.meta_path}") + + self.video_params = video_params + + self.data_frac = data_frac + self.is_strict_loading = is_strict_loading + self.skip_missing_files = skip_missing_files + self.start_index = start_index + self.transforms = self._get_transforms( + video_params["input_height"], + video_params["input_width"], + ) + + self.missing_files = [] + self._load_metadata() + + def _get_transforms(self, input_height, input_width, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5]): + normalize = v2.Normalize(mean=norm_mean, std=norm_std) + return v2.Compose( + [ + v2.Resize(input_height, antialias=True), + v2.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + def _load_metadata(self): + metadata = pd.read_csv( + self.meta_path, + on_bad_lines="skip", + encoding="ISO-8859-1", + engine="python", + sep=",",) + + if self.data_frac < 1: + metadata = metadata.sample(frac=self.data_frac) + self.metadata = metadata + self.metadata.dropna(inplace=True) + + def _get_video_path(self, sample): + """reduce the access to the disk + """ + rel_video_fp = str(sample["videos"]) + abs_video_fp = os.path.join(self.data_dir, rel_video_fp) + return abs_video_fp, rel_video_fp + + def __len__(self): + return len(self.metadata) + + def __getitem__(self, item): + item = item % len(self.metadata) + sample = self.metadata.iloc[item] + video_fp, _ = self._get_video_path(sample) + + try: + if os.path.isfile(video_fp): + imgs, idxs = read_frames_with_decord( + video_path=video_fp, + sample_num_frames=self.video_params["sample_num_frames"], + sample_fps=self.video_params["sample_fps"], + start_index=self.start_index + ) + else: + # if the video file is missing + if video_fp not in self.missing_files: + self.missing_files.append(video_fp) + # 
resample another video or not + if self.skip_missing_files: + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Warning: missing video file {video_fp}. Resampling another video.") + return self.__getitem__(np.random.choice(self.__len__())) + else: + raise ValueError(f"Video file {video_fp} is missing, skip_missing_files={self.skip_missing_files}.") + except Exception as e: + # if the video exists, but loading failed + if self.is_strict_loading: + raise ValueError(f"Video loading failed for {video_fp}, is_strict_loading={self.is_strict_loading}.") from e + else: + print0("[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Warning: using the pure black image as the frame sample") + imgs = Image.new("RGB", (self.video_params["input_width"], self.video_params["input_height"]), (0, 0, 0)) + imgs = v2.ToTensor()(imgs).unsqueeze(0) + + if self.transforms is not None: + # imgs: (T, C, H, W) + imgs = self.transforms(imgs) + + if imgs.shape[0] < self.video_params["sample_num_frames"]: + imgs = torch.cat([imgs, imgs[-1].unsqueeze(0).repeat(self.video_params["sample_num_frames"] - imgs.shape[0], 1, 1, 1)], dim=0) + + imgs = imgs.permute(1, 0, 2, 3) # (C, T, H, W) + + return { + 'jpg': imgs, + "path": video_fp + } + + +class VidTokValDataset(Dataset): + def __init__( + self, + data_dir: str, + video_params: dict, + meta_path: Union[None, str] = None, + pre_load_frames: bool = True, + is_strict_loading: bool = True, + last_frames_handle: str = "repeat", # 'repeat', 'drop' + skip_missing_files: bool = False, + read_long_video: bool = False, + chunk_size: int = 16, + is_causal: bool = True, + ): + super().__init__() + + self.data_dir = data_dir + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Use data dir: {self.data_dir}" + ) + + self.meta_path = meta_path + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Use meta path: {self.meta_path}" + ) + + self.video_params = video_params + self.read_long_video = read_long_video + self.chunk_size = chunk_size + self.is_causal = is_causal + + self.is_strict_loading = is_strict_loading + self.last_frames_handle = last_frames_handle + self.skip_missing_files = skip_missing_files + self.transforms = self._get_transforms( + video_params["input_height"], + video_params["input_width"], + ) + + self.missing_files = [] + self._load_metadata() + self._load_every_frame_from_meta() + + if pre_load_frames: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Pre-loading all frames into CPU..." 
+ ) + self._pre_load_frames() + + def _get_transforms(self, input_height, input_width, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5]): + normalize = v2.Normalize(mean=norm_mean, std=norm_std) + return v2.Compose( + [ + v2.Resize(input_height, antialias=True), + v2.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + def _load_metadata(self): + if self.meta_path is not None: + metadata = pd.read_csv( + self.meta_path, + on_bad_lines="skip", + encoding="ISO-8859-1", + engine="python", + sep=",", + ) + self.metadata = metadata + self.metadata.dropna(inplace=True) + else: + self.metadata = glob.glob(os.path.join(self.data_dir, '**', '*.mp4'), recursive=True) + + def _load_every_frame_from_meta(self): + decord.bridge.set_bridge("torch") + self.frames_batch = [] + for video_idx in range(len(self.metadata)): + try: + sample = self.metadata.iloc[video_idx] + video_fp, _ = self._get_video_path(sample) + except: + video_fp = self.metadata[video_idx] + if os.path.isfile(video_fp): + video_reader = decord.VideoReader(video_fp, num_threads=0) + total_frames = len(video_reader) + fps = video_reader.get_avg_fps() # float + interval = round(fps / self.video_params["sample_fps"]) + frame_ids = list(range(0, total_frames, interval)) + + if self.read_long_video: + video_length = len(frame_ids) + if self.is_causal and video_length > self.chunk_size: + num_frames_ids = frame_ids[:self.chunk_size * ((video_length - 1) // self.chunk_size) + 1] + elif not self.is_causal and video_length >= self.chunk_size: + num_frames_ids = frame_ids[:self.chunk_size * (video_length // self.chunk_size)] + else: + continue + self.frames_batch.append( + { + "video_fp": video_fp, + "num_frames_ids": num_frames_ids, + } + ) + else: + for x in range(0, len(frame_ids), self.video_params["sample_num_frames"]): + num_frames_ids = frame_ids[x : x + self.video_params["sample_num_frames"]] + if len(num_frames_ids) < self.video_params["sample_num_frames"]: + if self.last_frames_handle == "repeat": + num_frames_ids += [num_frames_ids[-1]] * ( + self.video_params["sample_num_frames"] - len(num_frames_ids) + ) + elif self.last_frames_handle == "drop": + continue + else: + raise ValueError(f"Invalid last_frames_handle: {self.last_frames_handle}") + self.frames_batch.append( + { + "video_fp": video_fp, + "num_frames_ids": num_frames_ids, + } + ) + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Loaded all frames index from {len(self.metadata)} videos." 
+ ) + + def _pre_load_frames(self): + last_video_fp = None + for idx in trange(len(self.frames_batch), desc="Pre-loading all frames"): + if self.frames_batch[idx]["video_fp"] != last_video_fp: + video_reader = decord.VideoReader(self.frames_batch[idx]["video_fp"], num_threads=0) + last_video_fp = self.frames_batch[idx]["video_fp"] + self.frames_batch[idx]["frames"] = ( + video_reader.get_batch(self.frames_batch[idx]["num_frames_ids"]).permute(0, 3, 1, 2).float() + / 255.0 + ) + + def _get_video_path(self, sample): + """reduce the access to the disk""" + rel_video_fp = str(sample["videos"]) + abs_video_fp = os.path.join(self.data_dir, rel_video_fp) + return abs_video_fp, rel_video_fp + + def __len__(self): + return len(self.frames_batch) + + def __getitem__(self, item): + video_fp = self.frames_batch[item]["video_fp"] + + try: + if "frames" in self.frames_batch[item]: + imgs = self.frames_batch[item]["frames"] + elif os.path.isfile(video_fp): + video_reader = decord.VideoReader(video_fp, num_threads=0) + imgs = ( + video_reader.get_batch(self.frames_batch[item]["num_frames_ids"]).permute(0, 3, 1, 2).float() + / 255.0 + ) + else: + # if the video file is missing + if video_fp not in self.missing_files: + self.missing_files.append(video_fp) + # resample another video or not + if self.skip_missing_files: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: missing video file {video_fp}. Resampling another video." + ) + return self.__getitem__(np.random.choice(self.__len__())) + else: + raise ValueError(f"Video file {video_fp} is missing, skip_missing_files={self.skip_missing_files}.") + except Exception as e: + # if the video exists, but loading failed + if self.is_strict_loading: + raise ValueError( + f"Video loading failed for {video_fp}, is_strict_loading={self.is_strict_loading}." + ) from e + else: + print0( + "[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: using the pure black image as the frame sample" + ) + imgs = Image.new( + "RGB", (self.video_params["input_width"], self.video_params["input_height"]), (0, 0, 0) + ) + imgs = v2.ToTensor()(imgs).unsqueeze(0) + + if self.transforms is not None: + imgs = self.transforms(imgs) + + if not self.read_long_video: + if imgs.shape[0] < self.video_params["sample_num_frames"]: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: video {video_fp} has less frames {imgs.shape[0]} than sample_num_frames {self.video_params['sample_num_frames']}." 
+ ) + imgs = torch.cat( + [imgs, imgs[-1].unsqueeze(0).repeat(self.video_params["sample_num_frames"] - imgs.shape[0], 1, 1, 1)], + dim=0, + ) + + imgs = imgs.permute(1, 0, 2, 3) # (C, T, H, W) + + return { + "jpg": imgs, + "path": video_fp, + } diff --git a/Meissonic/VidTok/vidtok/models/__pycache__/autoencoder_v1_1.cpython-310.pyc b/Meissonic/VidTok/vidtok/models/__pycache__/autoencoder_v1_1.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5011d2f6bc8b7a1917c4520856a886063a277cc5 Binary files /dev/null and b/Meissonic/VidTok/vidtok/models/__pycache__/autoencoder_v1_1.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/models/autoencoder.py b/Meissonic/VidTok/vidtok/models/autoencoder.py new file mode 100644 index 0000000000000000000000000000000000000000..96da5e6c74621b4f82538453e850567245adbbb0 --- /dev/null +++ b/Meissonic/VidTok/vidtok/models/autoencoder.py @@ -0,0 +1,517 @@ +import re +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union, Optional, List +from omegaconf import ListConfig +from packaging import version + +import torch +import lightning.pytorch as pl + +from safetensors.torch import load_file as load_safetensors +from vidtok.modules.ema import LitEma +from vidtok.modules.util import (default, get_obj_from_str, + instantiate_from_config, print0) +from vidtok.modules.regularizers import pack_one, unpack_one, rearrange + + +class AbstractAutoencoder(pl.LightningModule): + """ + This is the base class for all autoencoders + """ + + def __init__( + self, + ema_decay: Union[None, float] = None, + monitor: Union[None, str] = None, + mode: Union[None, str] = None, + input_key: str = "jpg", + ): + super().__init__() + + self.input_key = input_key + self.use_ema = ema_decay is not None + self.ema_decay = ema_decay + if monitor is not None: + self.monitor = monitor + if mode is not None: + self.mode = mode + + if version.parse(torch.__version__) >= version.parse("2.0.0"): + self.automatic_optimization = False + + @abstractmethod + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + raise NotImplementedError() + + @abstractmethod + def get_input(self, batch) -> Any: + raise NotImplementedError() + + def on_train_batch_end(self, *args, **kwargs): + # for EMA computation + if self.use_ema: + self.model_ema(self) + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Switched to EMA weights" + ) + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Restored training weights" + ) + + @abstractmethod + def encode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] encode()-method of abstract base class called" + ) + + @abstractmethod + def decode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] decode()-method of abstract base class called" + ) + + def instantiate_optimizer_from_config(self, params, lr, cfg): + print0( + f"[bold 
magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] loading >>> {cfg['target']} <<< optimizer from config" + ) + return get_obj_from_str(cfg["target"])(params, lr=lr, **cfg.get("params", dict())) + + @abstractmethod + def configure_optimizers(self) -> Any: + raise NotImplementedError() + + +class AutoencodingEngine(AbstractAutoencoder): + """ + Base class for all video tokenizers that we train + """ + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + verbose = kwargs.pop("verbose", True) + super().__init__(*args, **kwargs) + + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + self.optimizer_config = default(optimizer_config, {"target": "torch.optim.Adam"}) + self.lr_g_factor = lr_g_factor + self.is_causal = self.encoder.is_causal + + self.temporal_compression_ratio = 2 ** len(self.encoder.tempo_ds) + self.use_tiling = False + # Decode more latent frames at once + self.num_sample_frames_batch_size = 16 + self.num_latent_frames_batch_size = self.num_sample_frames_batch_size // self.temporal_compression_ratio + # We make the minimum height and width of sample for tiling half that of the generally supported + self.tile_sample_min_height = 256 + self.tile_sample_min_width = 256 + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = 0 # 1 / 8 + self.tile_overlap_factor_width = 0 # 1 / 8 + + if self.use_ema: + self.model_ema = LitEma(self, decay=self.ema_decay) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Keeping EMAs of {len(list(self.model_ema.buffers()))}." + ) + + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Use ckpt_path: {ckpt_path}" + ) + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, verbose=verbose) + + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + if path.endswith("ckpt"): + ckpt = torch.load(path, map_location="cpu") + weights = ckpt["state_dict"] if "state_dict" in ckpt else ckpt + elif path.endswith("safetensors"): + weights = load_safetensors(path) + else: + raise NotImplementedError(f"Unknown checkpoint: {path}") + + keys = list(weights.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Deleting key {k} from state_dict." 
+ ) + del weights[k] + + missing, unexpected = self.load_state_dict(weights, strict=False) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if verbose: + if len(missing) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Missing Keys: {missing}" + ) + if len(unexpected) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Unexpected Keys: {unexpected}" + ) + + def get_input(self, batch: Dict) -> torch.Tensor: + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(filter(lambda p: p.requires_grad, self.encoder.parameters())) + + list(filter(lambda p: p.requires_grad, self.decoder.parameters())) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def enable_tiling( + self, + tile_sample_min_height: Optional[int] = None, + tile_sample_min_width: Optional[int] = None, + tile_overlap_factor_height: Optional[float] = None, + tile_overlap_factor_width: Optional[float] = None, + ) -> None: + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + self.use_tiling = False + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + if self.use_tiling: + z = self.tile_encode(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + else: + z = self.encoder(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + + if return_reg_log: + return z, reg_log + return z + + def tile_encode(self, x: Any) -> Any: + + num_frames, height, width = x.shape[-3:] + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = 
self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = [[0, num_frames]] + result_z = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile = self.encoder(tile) + result_z.append(tile) + + row.append(torch.cat(result_z, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + enc = torch.cat(result_rows, dim=3) + + return enc + + def indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + token_indices = rearrange(token_indices, "... -> ... 1") + token_indices, ps = pack_one(token_indices, "b * d") + codes = self.regularization.indices_to_codes(token_indices) + codes = rearrange(codes, "b d n c -> b n (c d)") + z = self.regularization.project_out(codes) + z = unpack_one(z, ps, "b * d") + z = rearrange(z, "b ... d -> b d ...") + return z + + def decode(self, z: Any, decode_from_indices: bool = False) -> torch.Tensor: + if decode_from_indices: + z = self.indices_to_latent(z) + if self.use_tiling: + x = self.tile_decode(z) + else: + x = self.decoder(z) + return x + + def tile_decode(self, z: Any) -> torch.Tensor: + + num_frames, height, width = z.shape[-3:] + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. 
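+ # Note (added comment): with the default tile_overlap_factor_height/width of 0 the tiles are
+ # laid out edge-to-edge, the blend extents are 0 and blend_v/blend_h are effectively no-ops;
+ # non-zero overlap factors trade extra decoding work for smoother seams between tiles.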
+ rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = [[0, num_frames]] + time = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + tile = z[ + :, + :, + start_frame : end_frame, + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + tile = self.decoder(tile) + if self.is_causal and end_frame + 1 <= num_frames: + tile = tile[:, :, : -self.encoder.time_downsample_factor] + time.append(tile) + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + return dec + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if self.encoder.fix_encoder: + with torch.no_grad(): + z, reg_log = self.encode(x, return_reg_log=True) + else: + z, reg_log = self.encode(x, return_reg_log=True) + + dec = self.decode(z) + return z, dec, reg_log + + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + opt_g, opt_d = self.optimizers() + + # autoencode loss + self.toggle_optimizer(opt_g) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + + # gradient clip + torch.nn.utils.clip_grad_norm_(self.get_autoencoder_params(), 20.0) + opt_g.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + torch.nn.utils.clip_grad_norm_(self.get_discriminator_params(), 20.0) + opt_d.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + lr = opt_g.param_groups[0]["lr"] + self.log( + "lr_abs", + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + ) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + 
regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + log_dict_ae.update(log_dict_disc) + self.log_dict(log_dict_ae) + return log_dict_ae + + def configure_optimizers(self) -> Any: + ae_params = self.get_autoencoder_params() + disc_params = self.get_discriminator_params() + + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + opt_disc = self.instantiate_optimizer_from_config(disc_params, self.learning_rate, self.optimizer_config) + + return [opt_ae, opt_disc], [] + + @torch.no_grad() + def log_images(self, batch: Dict) -> Dict: + log = dict() + x = self.get_input(batch) + _, xrec, _ = self(x) + log["inputs"] = x + log["recs"] = xrec + with self.ema_scope(): + _, xrec_ema, _ = self(x) + log["recs_ema"] = xrec_ema + return log diff --git a/Meissonic/VidTok/vidtok/models/autoencoder_v1_1.py b/Meissonic/VidTok/vidtok/models/autoencoder_v1_1.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1182f573eee05b634daeb9a065b7adaacc600b --- /dev/null +++ b/Meissonic/VidTok/vidtok/models/autoencoder_v1_1.py @@ -0,0 +1,588 @@ +import re +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union, Optional, List +from omegaconf import ListConfig +from packaging import version + +import torch +import lightning.pytorch as pl + +from safetensors.torch import load_file as load_safetensors +from vidtok.modules.ema import LitEma +from vidtok.modules.util import (default, get_obj_from_str, + instantiate_from_config, print0) +from vidtok.modules.regularizers import pack_one, unpack_one, rearrange + + +class AbstractAutoencoder(pl.LightningModule): + """ + This is the base class for all autoencoders + """ + + def __init__( + self, + ema_decay: Union[None, float] = None, + monitor: Union[None, str] = None, + mode: Union[None, str] = None, + input_key: str = "jpg", + ): + super().__init__() + + self.input_key = input_key + self.use_ema = ema_decay is not None + self.ema_decay = ema_decay + if monitor is not None: + self.monitor = monitor + if mode is not None: + self.mode = mode + + if version.parse(torch.__version__) >= version.parse("2.0.0"): + self.automatic_optimization = False + + @abstractmethod + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + raise NotImplementedError() + + @abstractmethod + def get_input(self, batch) -> Any: + raise NotImplementedError() + + def on_train_batch_end(self, *args, **kwargs): + # for EMA computation + if self.use_ema: + self.model_ema(self) + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Switched to EMA weights" + ) + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Restored training weights" + ) + + @abstractmethod + def encode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] 
encode()-method of abstract base class called" + ) + + @abstractmethod + def decode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] decode()-method of abstract base class called" + ) + + def instantiate_optimizer_from_config(self, params, lr, cfg): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] loading >>> {cfg['target']} <<< optimizer from config" + ) + return get_obj_from_str(cfg["target"])(params, lr=lr, **cfg.get("params", dict())) + + @abstractmethod + def configure_optimizers(self) -> Any: + raise NotImplementedError() + + +class AutoencodingEngine(AbstractAutoencoder): + """ + Base class for all video tokenizers that we train + """ + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + use_tiling: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + verbose = kwargs.pop("verbose", True) + self.use_tiling = kwargs.pop("use_tiling", False) + self.t_chunk_enc = kwargs.pop("t_chunk_enc", 16) + super().__init__(*args, **kwargs) + + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + self.optimizer_config = default(optimizer_config, {"target": "torch.optim.Adam"}) + self.lr_g_factor = lr_g_factor + + self.t_chunk_dec = self.t_chunk_enc // self.encoder.time_downsample_factor + self.use_overlap = False + self.is_causal = self.encoder.is_causal + + self.temporal_compression_ratio = 2 ** len(self.encoder.tempo_ds) + + self.use_tiling = use_tiling + # Decode more latent frames at once + self.num_sample_frames_batch_size = 16 + self.num_latent_frames_batch_size = self.num_sample_frames_batch_size // self.temporal_compression_ratio + + # We make the minimum height and width of sample for tiling half that of the generally supported + self.tile_sample_min_height = 256 + self.tile_sample_min_width = 256 + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = 0 # 1 / 8 + self.tile_overlap_factor_width = 0 # 1 / 8 + + if self.use_ema: + self.model_ema = LitEma(self, decay=self.ema_decay) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Keeping EMAs of {len(list(self.model_ema.buffers()))}." 
+ ) + + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Use ckpt_path: {ckpt_path}" + ) + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, verbose=verbose) + + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + if path.endswith("ckpt"): + ckpt = torch.load(path, map_location="cpu") + weights = ckpt["state_dict"] if "state_dict" in ckpt else ckpt + elif path.endswith("safetensors"): + weights = load_safetensors(path) + else: + raise NotImplementedError(f"Unknown checkpoint: {path}") + + keys = list(weights.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Deleting key {k} from state_dict." + ) + del weights[k] + + missing, unexpected = self.load_state_dict(weights, strict=False) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if verbose: + if len(missing) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Missing Keys: {missing}" + ) + if len(unexpected) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Unexpected Keys: {unexpected}" + ) + + def get_input(self, batch: Dict) -> torch.Tensor: + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(filter(lambda p: p.requires_grad, self.encoder.parameters())) + + list(filter(lambda p: p.requires_grad, self.decoder.parameters())) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + def _empty_causal_cached(self, parent): + for name, module in parent.named_modules(): + if hasattr(module, 'causal_cache'): + module.causal_cache = None + + def _set_first_chunk(self, is_first_chunk=True): + for module in self.modules(): + if hasattr(module, 'is_first_chunk'): + module.is_first_chunk = is_first_chunk + + def _set_cache_offset(self, modules, cache_offset=0): + for module in modules: + for submodule in module.modules(): + if hasattr(submodule, 'cache_offset'): + submodule.cache_offset = cache_offset + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def build_chunk_start_end(self, t, decoder_mode=False): + start_end = [[0, 1]] + start = 1 + end = start + while True: + if start >= t: + break + end = min(t, end + (self.t_chunk_dec if decoder_mode else self.t_chunk_enc)) + start_end.append([start, end]) + start = end + return start_end + + def enable_tiling( + self, + 
tile_sample_min_height: Optional[int] = None, + tile_sample_min_width: Optional[int] = None, + tile_overlap_factor_height: Optional[float] = None, + tile_overlap_factor_width: Optional[float] = None, + ) -> None: + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + self.use_tiling = False + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + self._empty_causal_cached(self.encoder) + self._set_first_chunk(True) + + if self.use_tiling: + z = self.tile_encode(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + else: + z = self.encoder(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + + if return_reg_log: + return z, reg_log + return z + + def tile_encode(self, x: Any) -> Any: + + num_frames, height, width = x.shape[-3:] + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + rows = [] + + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = self.build_chunk_start_end(num_frames) + result_z = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + self._set_first_chunk(idx == 0) + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile = self.encoder(tile) + result_z.append(tile) + row.append(torch.cat(result_z, dim=2)) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + enc = torch.cat(result_rows, dim=3) + + return enc + + def indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + assert token_indices.dim() == 4, "token_indices should be of shape (b, t, h, w)" + b, t, h, w = token_indices.shape + token_indices = token_indices.unsqueeze(-1).reshape(b, -1, 1) + codes = self.regularization.indices_to_codes(token_indices) + codes = codes.permute(0, 2, 3, 1).reshape(b, codes.shape[2], -1) + z = self.regularization.project_out(codes) + return z.reshape(b, t, h, w, -1).permute(0, 4, 1, 2, 3) + + def tile_indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + num_frames = token_indices.shape[1] + start_end = 
self.build_chunk_start_end(num_frames, decoder_mode=True) + result_z = [] + for (start, end) in start_end: + chunk = token_indices[:, start:end, :, :] + chunk_z = self.indices_to_latent(chunk) + result_z.append(chunk_z.clone()) + return torch.cat(result_z, dim=2) + + def decode(self, z: Any, decode_from_indices: bool = False) -> torch.Tensor: + if decode_from_indices: + if self.use_tiling: + z = self.tile_indices_to_latent(z) + else: + z = self.indices_to_latent(z) + self._empty_causal_cached(self.decoder) + self._set_first_chunk(True) + + if self.use_tiling: + x = self.tile_decode(z) + else: + x = self.decoder(z) + return x + + + def tile_decode(self, z: Any) -> torch.Tensor: + + num_frames, height, width = z.shape[-3:] + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + if self.is_causal: + assert self.encoder.time_downsample_factor in [2, 4, 8], "Only support 2x, 4x or 8x temporal downsampling now." + if self.encoder.time_downsample_factor == 4: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1]], 2) + self._set_cache_offset([self.decoder.up_temporal[1].upsample, self.decoder.up_temporal[0], self.decoder.conv_out], 4) + elif self.encoder.time_downsample_factor == 2: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1], self.decoder.up_temporal[0], self.decoder.conv_out], 2) + else: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[3].upsample, self.decoder.up_temporal[2]], 2) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1]], 4) + self._set_cache_offset([self.decoder.up_temporal[1].upsample, self.decoder.up_temporal[0], self.decoder.conv_out], 8) + + start_end = self.build_chunk_start_end(num_frames, decoder_mode=True) + time = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + self._set_first_chunk(idx == 0) + tile = z[ + :, + :, + start_frame : (end_frame + 1 if self.is_causal and end_frame + 1 <= num_frames else end_frame), + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + tile = self.decoder(tile) + if self.is_causal and end_frame + 1 <= num_frames: + tile = tile[:, :, : -self.encoder.time_downsample_factor] + time.append(tile) + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) 
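+ # The blended, cropped tiles of each row are concatenated along the width axis (dim=4)
+ # just below; the resulting rows are then concatenated along the height axis (dim=3).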
+ result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + return dec + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if self.encoder.fix_encoder: + with torch.no_grad(): + z, reg_log = self.encode(x, return_reg_log=True) + else: + z, reg_log = self.encode(x, return_reg_log=True) + dec = self.decode(z) + if dec.shape[2] != x.shape[2]: + dec = dec[:, :, -x.shape[2]:, ...] + return z, dec, reg_log + + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + opt_g, opt_d = self.optimizers() + + # autoencode loss + self.toggle_optimizer(opt_g) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + + # gradient clip + torch.nn.utils.clip_grad_norm_(self.get_autoencoder_params(), 20.0) + opt_g.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + torch.nn.utils.clip_grad_norm_(self.get_discriminator_params(), 20.0) + opt_d.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + lr = opt_g.param_groups[0]["lr"] + self.log( + "lr_abs", + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + ) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + log_dict_ae.update(log_dict_disc) + self.log_dict(log_dict_ae) + return log_dict_ae + + def configure_optimizers(self) -> Any: + ae_params = self.get_autoencoder_params() + disc_params = self.get_discriminator_params() + + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + opt_disc = self.instantiate_optimizer_from_config(disc_params, self.learning_rate, self.optimizer_config) + + return [opt_ae, opt_disc], [] + + @torch.no_grad() + def log_images(self, batch: Dict) -> Dict: + log = dict() + x = self.get_input(batch) + _, xrec, _ = self(x) + log["inputs"] = x + log["recs"] = xrec + with self.ema_scope(): + _, xrec_ema, _ = 
self(x) + log["recs_ema"] = xrec_ema + return log \ No newline at end of file diff --git a/Meissonic/VidTok/vidtok/modules/__pycache__/discriminator.cpython-310.pyc b/Meissonic/VidTok/vidtok/modules/__pycache__/discriminator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f06e0f1b5bfbcb1546a046217de1e26d4fc105a Binary files /dev/null and b/Meissonic/VidTok/vidtok/modules/__pycache__/discriminator.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/modules/__pycache__/distributions.cpython-310.pyc b/Meissonic/VidTok/vidtok/modules/__pycache__/distributions.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0392cf08e73bca7b4dc09f7b34b26940f154872c Binary files /dev/null and b/Meissonic/VidTok/vidtok/modules/__pycache__/distributions.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/modules/__pycache__/ema.cpython-310.pyc b/Meissonic/VidTok/vidtok/modules/__pycache__/ema.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4da146078e3dc25acecdca337e11233bb1d6c25 Binary files /dev/null and b/Meissonic/VidTok/vidtok/modules/__pycache__/ema.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/modules/__pycache__/losses.cpython-310.pyc b/Meissonic/VidTok/vidtok/modules/__pycache__/losses.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cff63e976a587d14dfdea56824f1b435d1b35e2f Binary files /dev/null and b/Meissonic/VidTok/vidtok/modules/__pycache__/losses.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/modules/__pycache__/lpips.cpython-310.pyc b/Meissonic/VidTok/vidtok/modules/__pycache__/lpips.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e05c31a332e050545c4c764ca231ecd6dc0d608e Binary files /dev/null and b/Meissonic/VidTok/vidtok/modules/__pycache__/lpips.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/modules/__pycache__/model_3dcausal_v1_1.cpython-310.pyc b/Meissonic/VidTok/vidtok/modules/__pycache__/model_3dcausal_v1_1.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69191a078f4d9a177c49c72d203408df55d824d4 Binary files /dev/null and b/Meissonic/VidTok/vidtok/modules/__pycache__/model_3dcausal_v1_1.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/modules/__pycache__/regularizers.cpython-310.pyc b/Meissonic/VidTok/vidtok/modules/__pycache__/regularizers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe3c22a6e0b59f3770a016344abfa2f7f0214e77 Binary files /dev/null and b/Meissonic/VidTok/vidtok/modules/__pycache__/regularizers.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc b/Meissonic/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..901aa877898a6ff71557c21b252b206b280e9154 Binary files /dev/null and b/Meissonic/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok/modules/discriminator.py b/Meissonic/VidTok/vidtok/modules/discriminator.py new file mode 100644 index 0000000000000000000000000000000000000000..f9d94b21b22f5019f3cdcc4cbf2e98bd0ce0ee02 --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/discriminator.py @@ -0,0 +1,201 @@ +import functools + +import torch +import torch.nn as nn + + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + nn.init.normal_(m.weight.data, 
0.0, 0.02) + elif classname.find("BatchNorm") != -1: + nn.init.normal_(m.weight.data, 1.0, 0.02) + nn.init.constant_(m.bias.data, 0) + + +class ActNorm(nn.Module): + def __init__(self, num_features, logdet=False, affine=True, allow_reverse_init=False): + assert affine + super().__init__() + self.logdet = logdet + self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1)) + self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1)) + self.allow_reverse_init = allow_reverse_init + + self.register_buffer("initialized", torch.tensor(0, dtype=torch.uint8)) + + def initialize(self, input): + with torch.no_grad(): + flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1) + mean = flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).permute(1, 0, 2, 3) + std = flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).permute(1, 0, 2, 3) + + self.loc.data.copy_(-mean) + self.scale.data.copy_(1 / (std + 1e-6)) + + def forward(self, input, reverse=False): + if reverse: + return self.reverse(input) + if len(input.shape) == 2: + input = input[:, :, None, None] + squeeze = True + else: + squeeze = False + + _, _, height, width = input.shape + + if self.training and self.initialized.item() == 0: + self.initialize(input) + self.initialized.fill_(1) + + h = self.scale * (input + self.loc) + + if squeeze: + h = h.squeeze(-1).squeeze(-1) + + if self.logdet: + log_abs = torch.log(torch.abs(self.scale)) + logdet = height * width * torch.sum(log_abs) + logdet = logdet * torch.ones(input.shape[0]).to(input) + return h, logdet + + return h + + def reverse(self, output): + if self.training and self.initialized.item() == 0: + if not self.allow_reverse_init: + raise RuntimeError( + "Initializing ActNorm in reverse direction is " + "disabled by default. Use allow_reverse_init=True to enable." 
+ ) + else: + self.initialize(output) + self.initialized.fill_(1) + + if len(output.shape) == 2: + output = output[:, :, None, None] + squeeze = True + else: + squeeze = False + + h = output / self.scale - self.loc + + if squeeze: + h = h.squeeze(-1).squeeze(-1) + return h + + +class NLayerDiscriminator(nn.Module): + """Defines a PatchGAN discriminator as in Pix2Pix.""" + # https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py + def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): + """Construct a PatchGAN discriminator + Parameters: + input_nc (int) -- the number of channels in input images + ndf (int) -- the number of filters in the last conv layer + n_layers (int) -- the number of conv layers in the discriminator + """ + super(NLayerDiscriminator, self).__init__() + if not use_actnorm: + norm_layer = nn.BatchNorm2d + else: + norm_layer = ActNorm + if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters + use_bias = norm_layer.func != nn.BatchNorm2d + else: + use_bias = norm_layer != nn.BatchNorm2d + + kw = 4 + padw = 1 + sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)] + nf_mult = 1 + nf_mult_prev = 1 + for n in range(1, n_layers): # gradually increase the number of filters + nf_mult_prev = nf_mult + nf_mult = min(2**n, 8) + sequence += [ + nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + nf_mult_prev = nf_mult + nf_mult = min(2**n_layers, 8) + sequence += [ + nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + sequence += [ + nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) + ] # output 1 channel prediction map + self.main = nn.Sequential(*sequence) + + def forward(self, input): + """Standard forward.""" + return self.main(input) + + +class NLayerDiscriminator3D(nn.Module): + """Defines a 3D PatchGAN discriminator as in Pix2Pix but for 3D inputs.""" + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/losses/discriminator.py + def __init__(self, input_nc=1, ndf=64, n_layers=3, use_actnorm=False): + """ + Construct a 3D PatchGAN discriminator + + Parameters: + input_nc (int) -- the number of channels in input volumes + ndf (int) -- the number of filters in the last conv layer + n_layers (int) -- the number of conv layers in the discriminator + use_actnorm (bool) -- flag to use actnorm instead of batchnorm + """ + super(NLayerDiscriminator3D, self).__init__() + if not use_actnorm: + norm_layer = nn.BatchNorm3d + else: + raise NotImplementedError("Not implemented.") + if type(norm_layer) == functools.partial: + use_bias = norm_layer.func != nn.BatchNorm3d + else: + use_bias = norm_layer != nn.BatchNorm3d + + kw = 3 + padw = 1 + sequence = [nn.Conv3d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)] + nf_mult = 1 + nf_mult_prev = 1 + for n in range(1, n_layers): # gradually increase the number of filters + nf_mult_prev = nf_mult + nf_mult = min(2**n, 8) + sequence += [ + nn.Conv3d( + ndf * nf_mult_prev, + ndf * nf_mult, + kernel_size=(kw, kw, kw), + stride=(2 if n == 1 else 1, 2, 2), + padding=padw, + bias=use_bias, + ), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + nf_mult_prev = nf_mult + nf_mult = 
min(2**n_layers, 8) + sequence += [ + nn.Conv3d( + ndf * nf_mult_prev, ndf * nf_mult, kernel_size=(kw, kw, kw), stride=1, padding=padw, bias=use_bias + ), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + sequence += [ + nn.Conv3d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) + ] # output 1 channel prediction map + self.main = nn.Sequential(*sequence) + + def forward(self, input): + """Standard forward.""" + return self.main(input) diff --git a/Meissonic/VidTok/vidtok/modules/distributions.py b/Meissonic/VidTok/vidtok/modules/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..76e814475d4d32b9f5ead736cce3a234bb5a0e5f --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/distributions.py @@ -0,0 +1,49 @@ +import numpy as np +import torch + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) + return x + + def kl(self, other=None): + if self.deterministic: + return torch.Tensor([0.0]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3], + ) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + dim=[1, 2, 3], + ) + + def nll(self, sample, dims=[1, 2, 3]): + if self.deterministic: + return torch.Tensor([0.0]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims, + ) + + def mode(self): + return self.mean diff --git a/Meissonic/VidTok/vidtok/modules/ema.py b/Meissonic/VidTok/vidtok/modules/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..9f1f7606c2c9b68ebd2302215a9e08f9f31ed8ab --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/ema.py @@ -0,0 +1,82 @@ +import torch +from torch import nn + + +class LitEma(nn.Module): + def __init__(self, model, decay=0.9999, use_num_upates=True): + super().__init__() + if decay < 0.0 or decay > 1.0: + raise ValueError("Decay must be between 0 and 1") + + self.m_name2s_name = {} + self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32)) + self.register_buffer( + "num_updates", + torch.tensor(0, dtype=torch.int) if use_num_upates else torch.tensor(-1, dtype=torch.int), + ) + + for name, p in model.named_parameters(): + if p.requires_grad: + # remove as '.'-character is not allowed in buffers + s_name = name.replace(".", "") + self.m_name2s_name.update({name: s_name}) + self.register_buffer(s_name, p.clone().detach().data) + + self.collected_params = [] + + def reset_num_updates(self): + del self.num_updates + self.register_buffer("num_updates", torch.tensor(0, dtype=torch.int)) + + def forward(self, model): + decay = self.decay + + if self.num_updates >= 0: + self.num_updates += 1 + decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) + + one_minus_decay = 1.0 - decay + + with torch.no_grad(): + m_param = dict(model.named_parameters()) + shadow_params = 
dict(self.named_buffers()) + + for key in m_param: + if m_param[key].requires_grad: + sname = self.m_name2s_name[key] + shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) + shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) + else: + assert not key in self.m_name2s_name + + def copy_to(self, model): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + for key in m_param: + if m_param[key].requires_grad: + m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) + else: + assert not key in self.m_name2s_name + + def store(self, parameters): + """ + Save the current parameters for restoring later. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.collected_params = [param.clone() for param in parameters] + + def restore(self, parameters): + """ + Restore the parameters stored with the `store` method. + Useful to validate the model with EMA parameters without affecting the + original optimization process. Store the parameters before the + `copy_to` method. After validation (or model saving), use this to + restore the former parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored parameters. + """ + for c_param, param in zip(self.collected_params, parameters): + param.data.copy_(c_param.data) diff --git a/Meissonic/VidTok/vidtok/modules/logger.py b/Meissonic/VidTok/vidtok/modules/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..23c7d066e6508433cb2141c59dbd80cb0030ab6d --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/logger.py @@ -0,0 +1,289 @@ +import os +import numpy as np +import einops +import imageio +from typing import Union +from matplotlib import pyplot as plt +from PIL import Image, ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True # UnidentifiedImageError: https://github.com/python-pillow/Pillow/issues/5631 +from pathlib import Path + +import torch +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +from .util import exists, isheatmap + + +class ImageVideoLogger(Callback): + def __init__( + self, + batch_frequency, + max_samples, + clamp=True, + increase_log_steps=True, + batch_frequency_val=None, + video_fps=8, + rescale=True, + disabled=False, + log_on_batch_idx=True, # log on batch_idx instead of global_step. global_step is fixed in validation. 
batch_idx restarts at each validation + log_first_step=True, + log_images_kwargs=None, + log_videos_kwargs=None, + log_before_first_step=True, + enable_autocast=True, + ): + super().__init__() + self.enable_autocast = enable_autocast + self.rescale = rescale + self.batch_freq = batch_frequency + self.batch_freq_val = batch_frequency_val if batch_frequency_val is not None else batch_frequency + self.video_fps = video_fps + self.max_samples = max_samples + self.log_steps = [2**n for n in range(int(np.log2(self.batch_freq)) + 1)] + if not increase_log_steps: + self.log_steps = [self.batch_freq] + self.clamp = clamp + self.disabled = disabled + self.log_on_batch_idx = log_on_batch_idx + self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {} + self.log_videos_kwargs = log_videos_kwargs if log_videos_kwargs else {} + self.log_first_step = log_first_step + self.log_before_first_step = log_before_first_step + + @rank_zero_only + def log_img_local( + self, + save_dir, + split, + images, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "images", split) + for k in images: + if isheatmap(images[k]): + fig, ax = plt.subplots() + ax = ax.matshow( + images[k].cpu().numpy(), cmap="hot", interpolation="lanczos" + ) + plt.colorbar(ax) + plt.axis("off") + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + plt.savefig(path) + plt.close() + else: + if images[k].ndim == 5: + images[k] = einops.rearrange(images[k], "b c t h w -> (b t) c h w") + nrow = self.log_images_kwargs.get("n_rows", 8) + grid = torchvision.utils.make_grid(images[k], nrow=nrow) + if self.rescale: + grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1) + grid = grid.numpy() + grid = (grid * 255).astype(np.uint8) + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + path = os.path.join(root, filename) + os.makedirs(os.path.split(path)[0], exist_ok=True) + img = Image.fromarray(grid) + img.save(path) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "logger_log_image only supports WandbLogger currently" + pl_module.logger.log_image( + key=f"{split}/{k}", + images=[ + img, + ], + step=pl_module.global_step, + ) + + @rank_zero_only + def log_vid_local( + self, + save_dir, + split, + videos, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "videos", split) + for k in videos: + # if is video, we can add captions + if isinstance(videos[k], torch.Tensor) and videos[k].ndim == 5: + if self.rescale: + videos[k] = (videos[k] + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + frames = [videos[k][:, :, i] for i in range(videos[k].shape[2])] + frames = [torchvision.utils.make_grid(each, nrow=4) for each in frames] + frames = [einops.rearrange(each, "c h w -> 1 c h w") for each in frames] + frames = torch.clamp(torch.cat(frames, dim=0), min=0.0, max=1.0) + frames = (frames.numpy() * 255).astype(np.uint8) + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.gif".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + save_numpy_as_gif(frames, path, duration=1 / self.video_fps) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "log_videos 
only supports WandbLogger currently" + wandb.log({f"{split}/{k}": wandb.Video(frames, fps=self.video_fps)}) # k is str + + @rank_zero_only + def log_img(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + (self.check_frequency(check_idx) or self.check_frequency_val(batch_idx, split)) + and hasattr(pl_module, "log_images") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_images) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + with torch.no_grad(), torch.autocast(enabled=self.enable_autocast, device_type="cuda"): + images = pl_module.log_images(batch) + + for k in images: + N = min(images[k].shape[0], self.max_samples) + if not isheatmap(images[k]): + images[k] = images[k][:N] + if isinstance(images[k], torch.Tensor): + images[k] = images[k].detach().float().cpu() + if self.clamp and not isheatmap(images[k]): + images[k] = torch.clamp(images[k], -1.0, 1.0) + + self.log_img_local( + pl_module.logger.save_dir, + split, + images, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + @rank_zero_only + def log_vid(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + (self.check_frequency(check_idx) or self.check_frequency_val(batch_idx, split)) + and hasattr(pl_module, "log_videos") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_videos) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + with torch.no_grad(), torch.autocast(enabled=self.enable_autocast, device_type="cuda"): + videos = pl_module.log_videos( + batch, split=split, **self.log_videos_kwargs + ) + + for k in videos: + N = min(videos[k].shape[0], self.max_samples) + videos[k] = videos[k][:N] + if isinstance(videos[k], torch.Tensor): + videos[k] = videos[k].detach().float().cpu() + if self.clamp: + videos[k] = torch.clamp(videos[k], -1.0, 1.0) + + self.log_vid_local( + pl_module.logger.save_dir, + split, + videos, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + def check_frequency(self, check_idx): + if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and ( + check_idx > 0 or self.log_first_step + ): + try: + self.log_steps.pop(0) + except IndexError as e: + pass + return True + return False + + def check_frequency_val(self, check_idx, split): + if 'val' in split: + if ((check_idx % self.batch_freq_val) == 0) and ( + check_idx > 0 or self.log_first_step): + return True + return False + + @rank_zero_only + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): + if not self.disabled and (pl_module.global_step > 0 or self.log_first_step): + self.log_img(pl_module, batch, batch_idx, split="train") + self.log_vid(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): + if self.log_before_first_step and pl_module.global_step == 0: + self.log_img(pl_module, batch, batch_idx, split="train") + self.log_vid(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def 
on_validation_batch_end( + self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs + ): + if not self.disabled and pl_module.global_step > 0: + self.log_img(pl_module, batch, batch_idx, split="val") + self.log_vid(pl_module, batch, batch_idx, split="val") + if hasattr(pl_module, "calibrate_grad_norm"): + if ( + pl_module.calibrate_grad_norm and batch_idx % 25 == 0 + ) and batch_idx > 0: + self.log_gradients(trainer, pl_module, batch_idx=batch_idx) + + +def save_numpy_as_gif(frames, path, duration=None): + """ + save numpy array as gif file + """ + image_list = [] + for frame in frames: + image = frame.transpose(1, 2, 0) + image_list.append(image) + if duration: + imageio.mimsave(path, image_list, format="GIF", duration=duration, loop=0) + else: + imageio.mimsave(path, image_list, format="GIF", loop=0) diff --git a/Meissonic/VidTok/vidtok/modules/losses.py b/Meissonic/VidTok/vidtok/modules/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..c96f471c72124c56025b40a17f7d7bda81446ca5 --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/losses.py @@ -0,0 +1,262 @@ +from typing import Any, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .discriminator import (NLayerDiscriminator, NLayerDiscriminator3D, + weights_init) +from .lpips import LPIPS +from .util import default, print0 + + +def hinge_d_loss(logits_real, logits_fake): + loss_real = torch.mean(F.relu(1.0 - logits_real)) + loss_fake = torch.mean(F.relu(1.0 + logits_fake)) + d_loss = 0.5 * (loss_real + loss_fake) + return d_loss + + +def vanilla_d_loss(logits_real, logits_fake): + d_loss = 0.5 * (torch.mean(F.softplus(-logits_real)) + torch.mean(F.softplus(logits_fake))) + return d_loss + + +def adopt_weight(weight, global_step, threshold=0, value=0.0): + if global_step < threshold: + weight = value + return weight + + +def _sigmoid_cross_entropy_with_logits(labels, logits): + """ + non-saturating loss + """ + zeros = torch.zeros_like(logits, dtype=logits.dtype) + condition = logits >= zeros + relu_logits = torch.where(condition, logits, zeros) + neg_abs_logits = torch.where(condition, -logits, logits) + return relu_logits - logits * labels + torch.log1p(torch.exp(neg_abs_logits)) + + +def non_saturate_gen_loss(logits_fake): + """ + logits_fake: [B 1 H W] + """ + B = logits_fake.shape[0] + logits_fake = logits_fake.reshape(B, -1) + logits_fake = torch.mean(logits_fake, dim=-1) + gen_loss = torch.mean(_sigmoid_cross_entropy_with_logits(labels=torch.ones_like(logits_fake), logits=logits_fake)) + return gen_loss + + +def lecam_reg(real_pred, fake_pred, lecam_ema): + reg = torch.mean(F.relu(real_pred - lecam_ema.logits_fake_ema).pow(2)) + torch.mean( + F.relu(lecam_ema.logits_real_ema - fake_pred).pow(2) + ) + return reg + + +class LeCAM_EMA(object): + # https://github.com/TencentARC/SEED-Voken/blob/main/src/Open_MAGVIT2/modules/losses/vqperceptual.py + def __init__(self, init=0.0, decay=0.999): + self.logits_real_ema = init + self.logits_fake_ema = init + self.decay = decay + + def update(self, logits_real, logits_fake): + self.logits_real_ema = self.logits_real_ema * self.decay + torch.mean(logits_real).item() * (1 - self.decay) + self.logits_fake_ema = self.logits_fake_ema * self.decay + torch.mean(logits_fake).item() * (1 - self.decay) + + +class GeneralLPIPSWithDiscriminator(nn.Module): + def __init__( + self, + disc_start: int, + logvar_init: float = 0.0, + pixelloss_weight=1.0, + disc_num_layers: int = 3, + disc_in_channels: int = 
3, + disc_factor: float = 1.0, + disc_weight: float = 1.0, + disc_type: str = "3d", + perceptual_weight: float = 1.0, + lecam_loss_weight: float = 0.0, + disc_loss: str = "hinge", + scale_input_to_tgt_size: bool = False, + dims: int = 2, + learn_logvar: bool = False, + regularization_weights: Union[None, dict] = None, + gen_loss_cross_entropy: bool = False, + ): + super().__init__() + self.dims = dims + if self.dims > 2: + print0( + f"[bold cyan]\[vidtok.modules.losses][GeneralLPIPSWithDiscriminator][/bold cyan] running with dims={dims}. This means that for perceptual loss calculation, " + f"the LPIPS loss will be applied to each frame independently. " + ) + self.scale_input_to_tgt_size = scale_input_to_tgt_size + assert disc_loss in ["hinge", "vanilla"] + self.pixel_weight = pixelloss_weight + self.perceptual_loss = LPIPS().eval() + self.perceptual_weight = perceptual_weight + # output log variance + self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) + self.learn_logvar = learn_logvar + self.disc_type = disc_type + assert self.disc_type in ["2d", "3d"] + + if self.disc_type == "2d": + self.discriminator = NLayerDiscriminator( + input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=False + ).apply(weights_init) + else: + self.discriminator = NLayerDiscriminator3D( + input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=False + ).apply(weights_init) + self.discriminator_iter_start = disc_start + self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss + self.disc_factor = disc_factor + self.discriminator_weight = disc_weight + self.regularization_weights = default(regularization_weights, {}) + self.gen_loss_cross_entropy = gen_loss_cross_entropy + self.lecam_loss_weight = lecam_loss_weight + if self.lecam_loss_weight > 0: + self.lecam_ema = LeCAM_EMA() + + def get_trainable_parameters(self) -> Any: + return self.discriminator.parameters() + + def get_trainable_autoencoder_parameters(self) -> Any: + if self.learn_logvar: + yield self.logvar + yield from () + + def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): + if last_layer is not None: + nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] + g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] + else: + nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] + g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] + + d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) + d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() + d_weight = d_weight * self.discriminator_weight + return d_weight + + def forward( + self, + regularization_log, + inputs, + reconstructions, + optimizer_idx, + global_step, + last_layer=None, + split="train", + weights=None, + ): + if self.scale_input_to_tgt_size: + inputs = torch.nn.functional.interpolate(inputs, reconstructions.shape[2:], mode="bicubic", antialias=True) + + if optimizer_idx == 0: + bs = inputs.shape[0] + t = inputs.shape[2] + if self.dims > 2: + inputs, reconstructions = map( + lambda x: rearrange(x, "b c t h w -> (b t) c h w"), + (inputs, reconstructions), + ) + + rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) + if self.perceptual_weight > 0: + p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) + rec_loss = rec_loss + self.perceptual_weight * p_loss + else: + p_loss = torch.Tensor([0.0]) + + nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar + weighted_nll_loss 
= nll_loss + if weights is not None: + weighted_nll_loss = weights * nll_loss + weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] + nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] + + # now the GAN part + if self.disc_type == "3d": + reconstructions = rearrange(reconstructions, "(b t) c h w -> b c t h w", t=t).contiguous() + + # generator update + logits_fake = self.discriminator(reconstructions) + + if not self.gen_loss_cross_entropy: + g_loss = -torch.mean(logits_fake) + else: + g_loss = non_saturate_gen_loss(logits_fake) + + if self.disc_factor > 0.0: + try: + d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) + except RuntimeError: + assert not self.training + d_weight = torch.tensor(0.0) + else: + d_weight = torch.tensor(0.0) + + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) + loss = weighted_nll_loss + d_weight * disc_factor * g_loss + log = dict() + for k in regularization_log: + if k in self.regularization_weights: + loss = loss + self.regularization_weights[k] * regularization_log[k] + log[f"{split}/{k}"] = regularization_log[k].detach().mean() + + log.update( + { + "{}/total_loss".format(split): loss.clone().detach().mean(), + "{}/logvar".format(split): self.logvar.detach(), + "{}/nll_loss".format(split): nll_loss.detach().mean(), + "{}/rec_loss".format(split): rec_loss.detach().mean(), + "{}/p_loss".format(split): p_loss.detach().mean(), + "{}/d_weight".format(split): d_weight.detach(), + "{}/disc_factor".format(split): torch.tensor(disc_factor), + "{}/g_loss".format(split): g_loss.detach().mean(), + } + ) + return loss, log + + if optimizer_idx == 1: + if self.disc_type == "2d" and self.dims > 2: + inputs, reconstructions = map( + lambda x: rearrange(x, "b c t h w -> (b t) c h w"), + (inputs, reconstructions), + ) + + logits_real = self.discriminator(inputs.contiguous().detach()) + logits_fake = self.discriminator(reconstructions.contiguous().detach()) + + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) + + non_saturate_d_loss = self.disc_loss(logits_real, logits_fake) + + if self.lecam_loss_weight > 0: + self.lecam_ema.update(logits_real, logits_fake) + lecam_loss = lecam_reg(logits_real, logits_fake, self.lecam_ema) + d_loss = disc_factor * (lecam_loss * self.lecam_loss_weight + non_saturate_d_loss) + else: + d_loss = disc_factor * non_saturate_d_loss + + log = { + "{}/disc_loss".format(split): d_loss.clone().detach().mean(), + "{}/logits_real".format(split): logits_real.detach().mean(), + "{}/logits_fake".format(split): logits_fake.detach().mean(), + "{}/disc_factor".format(split): torch.tensor(disc_factor), + "{}/non_saturated_d_loss".format(split): non_saturate_d_loss.detach(), + } + + if self.lecam_loss_weight > 0: + log.update({"{}/lecam_loss".format(split): lecam_loss.detach()}) + + return d_loss, log diff --git a/Meissonic/VidTok/vidtok/modules/lpips.py b/Meissonic/VidTok/vidtok/modules/lpips.py new file mode 100644 index 0000000000000000000000000000000000000000..22bb5fa315618c0d1f0463d67ea57c29083fe302 --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/lpips.py @@ -0,0 +1,172 @@ +import hashlib +import os +from collections import namedtuple +from tqdm import tqdm + +import requests +import torch +import torch.nn as nn +from torchvision import models + +from .util import print0 + +URL_MAP = {"vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"} + +CKPT_MAP = {"vgg_lpips": 
"vgg.pth"} + +MD5_MAP = {"vgg_lpips": "d507d7349b931f0638a25a48a722f98a"} + + +def download(url, local_path, chunk_size=1024): + os.makedirs(os.path.split(local_path)[0], exist_ok=True) + with requests.get(url, stream=True) as r: + total_size = int(r.headers.get("content-length", 0)) + with tqdm(total=total_size, unit="B", unit_scale=True) as pbar: + with open(local_path, "wb") as f: + for data in r.iter_content(chunk_size=chunk_size): + if data: + f.write(data) + pbar.update(chunk_size) + + +def md5_hash(path): + with open(path, "rb") as f: + content = f.read() + return hashlib.md5(content).hexdigest() + + +def get_ckpt_path(name, root, check=False): + assert name in URL_MAP + path = os.path.join(root, CKPT_MAP[name]) + if os.path.exists(path) and not (check and not md5_hash(path) == MD5_MAP[name]): + print0( + "[bold cyan]\[vidtok.modules.lpips]\[get_ckpt_path][/bold cyan] Using existing path for {} model: {}".format( + name, path + ) + ) + return path + + # if not, download the model + print0( + "[bold cyan]\[vidtok.modules.lpips]\[get_ckpt_path][/bold cyan] Downloading {} model from {} to {}".format( + name, URL_MAP[name], path + ) + ) + download(URL_MAP[name], path) + md5 = md5_hash(path) + assert md5 == MD5_MAP[name], md5 + return path + + +class LPIPS(nn.Module): + # Learned perceptual metric + def __init__(self, use_dropout=True): + super().__init__() + self.scaling_layer = ScalingLayer() + self.chns = [64, 128, 256, 512, 512] # vg16 features + self.net = vgg16(pretrained=True, requires_grad=False) + self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout) + self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout) + self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout) + self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout) + self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout) + self.load_from_pretrained() + for param in self.parameters(): + param.requires_grad = False + + def load_from_pretrained(self, name="vgg_lpips"): + ckpt = get_ckpt_path(name, "checkpoints/lpips") + self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False) + print0("[bold cyan]\[vidtok.modules.lpips][LPIPS][/bold cyan] loaded pretrained LPIPS loss from {}".format(ckpt)) + + def forward(self, input, target): + in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target)) + outs0, outs1 = self.net(in0_input), self.net(in1_input) + feats0, feats1, diffs = {}, {}, {} + lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4] + for kk in range(len(self.chns)): + feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk]) + diffs[kk] = (feats0[kk] - feats1[kk]) ** 2 + + res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))] + val = res[0] + for l in range(1, len(self.chns)): + val += res[l] + return val + + +class ScalingLayer(nn.Module): + def __init__(self): + super(ScalingLayer, self).__init__() + self.register_buffer("shift", torch.Tensor([-0.030, -0.088, -0.188])[None, :, None, None]) + self.register_buffer("scale", torch.Tensor([0.458, 0.448, 0.450])[None, :, None, None]) + + def forward(self, inp): + return (inp - self.shift) / self.scale + + +class NetLinLayer(nn.Module): + """A single linear layer which does a 1x1 conv""" + + def __init__(self, chn_in, chn_out=1, use_dropout=False): + super(NetLinLayer, self).__init__() + layers = ( + [ + nn.Dropout(), + ] + if (use_dropout) + else [] + ) + layers += [ + nn.Conv2d(chn_in, chn_out, 1, stride=1, 
padding=0, bias=False), + ] + self.model = nn.Sequential(*layers) + + +class vgg16(torch.nn.Module): + def __init__(self, requires_grad=False, pretrained=True): + super(vgg16, self).__init__() + vgg_pretrained_features = models.vgg16(pretrained=pretrained).features + self.slice1 = torch.nn.Sequential() + self.slice2 = torch.nn.Sequential() + self.slice3 = torch.nn.Sequential() + self.slice4 = torch.nn.Sequential() + self.slice5 = torch.nn.Sequential() + self.N_slices = 5 + for x in range(4): + self.slice1.add_module(str(x), vgg_pretrained_features[x]) + for x in range(4, 9): + self.slice2.add_module(str(x), vgg_pretrained_features[x]) + for x in range(9, 16): + self.slice3.add_module(str(x), vgg_pretrained_features[x]) + for x in range(16, 23): + self.slice4.add_module(str(x), vgg_pretrained_features[x]) + for x in range(23, 30): + self.slice5.add_module(str(x), vgg_pretrained_features[x]) + if not requires_grad: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, X): + h = self.slice1(X) + h_relu1_2 = h + h = self.slice2(h) + h_relu2_2 = h + h = self.slice3(h) + h_relu3_3 = h + h = self.slice4(h) + h_relu4_3 = h + h = self.slice5(h) + h_relu5_3 = h + vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]) + out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3) + return out + + +def normalize_tensor(x, eps=1e-10): + norm_factor = torch.sqrt(torch.sum(x**2, dim=1, keepdim=True)) + return x / (norm_factor + eps) + + +def spatial_average(x, keepdim=True): + return x.mean([2, 3], keepdim=keepdim) diff --git a/Meissonic/VidTok/vidtok/modules/model_3dcausal.py b/Meissonic/VidTok/vidtok/modules/model_3dcausal.py new file mode 100644 index 0000000000000000000000000000000000000000..c71a8b09e370451046eef9ba60315feab2459e35 --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/model_3dcausal.py @@ -0,0 +1,885 @@ +from typing import Callable +from beartype import beartype +from beartype.typing import Tuple, Union + +import einops +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .util import checkpoint + + +def spatial_temporal_resblk(x, block_s, block_t, temb): + assert len(x.shape) == 5, "input should be 5D tensor, but got {}D tensor".format(len(x.shape)) + B, C, T, H, W = x.shape + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + x = block_s(x, temb) + x = einops.rearrange(x, "(b t) c h w -> b c t h w", b=B, t=T) + x = einops.rearrange(x, "b c t h w -> (b h w) c t") + x = block_t(x, temb) + x = einops.rearrange(x, "(b h w) c t -> b c t h w", b=B, h=H, w=W) + return x + + +def nonlinearity(x): + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32, norm_type="groupnorm"): + if norm_type == "groupnorm": + return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + elif norm_type == "layernorm": + return LayerNorm(num_channels=in_channels, eps=1e-6) + + +def pad_at_dim(t, pad, dim=-1, pad_mode="constant", value=0.0): + assert pad_mode in ["constant", "replicate", "reflect"] + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right + if pad_mode == "constant": + return F.pad(t, (*zeros, *pad), value=value) + return F.pad(t, (*zeros, *pad), mode=pad_mode) + + +def divisible_by(num, den): + return (num % den) == 0 + + +def is_odd(n): + return not divisible_by(n, 2) + + +def cast_tuple(t, length=1): + return t if isinstance(t, tuple) else ((t,) * 
length) + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class LayerNorm(nn.Module): + def __init__(self, num_channels, eps=1e-6, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = torch.nn.LayerNorm(num_channels, eps=eps, elementwise_affine=True) + + def forward(self, x): + if x.dim() == 5: + x = rearrange(x, "b c t h w -> b t h w c") + x = self.norm(x) + x = rearrange(x, "b t h w c -> b c t h w") + elif x.dim() == 4: + x = rearrange(x, "b c h w -> b h w c") + x = self.norm(x) + x = rearrange(x, "b h w c -> b c h w") + else: + x = rearrange(x, "b c s -> b s c") + x = self.norm(x) + x = rearrange(x, "b s c -> b c s") + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__() + self.in_channels = in_channels + self.norm_type = norm_type + + self.norm = Normalize(in_channels, norm_type=self.norm_type) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c h w -> b 1 (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x, **kwargs): + if self.use_checkpoint: + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x) + + def _forward(self, x, **kwargs): + h_ = x + h_ = self.attention(h_) + h_ = self.proj_out(h_) + return x + h_ + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + B = h_.shape[0] + h_ = rearrange(h_, "b c t h w -> (b t) c h w") + h_ = self.norm(h_) + h_ = rearrange(h_, "(b t) c h w -> b c t h w", b=B) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class CausalConv1d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: int, pad_mode="constant", **kwargs): + super().__init__() + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + self.pad_mode = pad_mode + self.time_pad = dilation * (kernel_size - 1) + (1 - stride) + self.time_causal_padding = 
(self.time_pad, 0) + + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + pad_mode = self.pad_mode if self.time_pad < x.shape[2] else "constant" + x = F.pad(x, self.time_causal_padding, mode=pad_mode) + return self.conv(x) + + +class CausalConv3d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: Union[int, Tuple[int, int, int]], pad_mode="constant", **kwargs): + super().__init__() + kernel_size = cast_tuple(kernel_size, 3) + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + dilation = cast_tuple(dilation, 3) + stride = cast_tuple(stride, 3) + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + assert is_odd(height_kernel_size) and is_odd(width_kernel_size) + + self.pad_mode = pad_mode + time_pad = dilation[0] * (time_kernel_size - 1) + (1 - stride[0]) + height_pad = dilation[1] * (height_kernel_size - 1) + (1 - stride[1]) + width_pad = dilation[2] * (height_kernel_size - 1) + (1 - stride[2]) + + self.time_pad = time_pad + self.time_causal_padding = ( + width_pad // 2, + width_pad - width_pad // 2, + height_pad // 2, + height_pad - height_pad // 2, + time_pad, + 0, + ) + + self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + pad_mode = self.pad_mode if self.time_pad < x.shape[2] else "constant" + + x = F.pad(x, self.time_causal_padding, mode=pad_mode) + return self.conv(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = CausalConv3d(in_channels, out_channels, 3, stride=(2, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 1, 0) + x1 = self.avg_pool(torch.nn.functional.pad(x, pad, mode="constant", value=0)) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.conv = CausalConv3d(in_channels, out_channels, 3) + # 
https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode="nearest").to( + x.dtype + ) + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetCausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv3d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + else: + self.nin_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + ) + self.use_checkpoint = use_checkpoint + + def 
forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + return x + h + + +class ResnetCausalBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv1d(out_channels, out_channels, kernel_size=3, stride=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + else: + self.nin_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=1, stride=1) + + if zero_init: + self.conv2.conv.weight.data.zero_() + self.conv2.conv.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class EncoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_ds=None, + tempo_ds=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + 
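# NOTE (reviewer annotation, not part of the original patch): this encoder factorizes video
# processing into a spatial branch (plain 2D ResnetBlock applied per frame, collected in self.down)
# and a causal temporal branch (ResnetCausalBlock1D applied per spatial location, collected in
# self.down_temporal). The pairs are combined by spatial_temporal_resblk() defined at the top of
# this file, which reshapes the 5D tensor so each branch sees the layout it expects, roughly:
#     x = rearrange(x, "b c t h w -> (b t) c h w");  x = block_s(x, temb)   # spatial, frame-wise
#     x = rearrange(x, "... -> (b h w) c t");        x = block_t(x, temb)   # temporal, causal
# The temporal blocks are built with zero_init=True, so their final convolution starts at zero and
# the encoder initially behaves like a per-frame image encoder, with temporal mixing learned later.
# Aside: in CausalConv3d above, width_pad is computed from height_kernel_size; this is harmless for
# the square kernels used here, and the v1.1 module later in this patch derives it from
# width_kernel_size instead.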
self.in_channels = in_channels + self.norm_type = norm_type + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.is_causal = True + + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.spatial_ds = list(range(0, self.num_resolutions - 1)) if spatial_ds is None else spatial_ds + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] if tempo_ds is None else tempo_ds + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetCausalBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + down_temporal.attn = attn_temporal + + if i_level in self.spatial_ds: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + down_temporal.downsample = TimeDownsampleResCausal2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls(block_in, norm_type=self.norm_type) + + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + ) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def forward(self, x): + temb = None + B, _, T, H, W = x.shape + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + + if i_level in self.spatial_ds: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal downsample + B, _, T, H, W = htmp.shape + if i_level in self.tempo_ds: + htmp = self.down_temporal[i_level].downsample(htmp) + + hs.append(htmp) + B, _, T, H, W = htmp.shape + + # middle + h = hs[-1] + h 
= self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = nonlinearity(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv_out(h) + + return h + + +class EncoderCausal3DPadding(EncoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.init_pad_mode = ignore_kwargs.get("init_pad_mode", "replicate") + self.time_padding = self.time_downsample_factor - 1 + if self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + video_len = x.shape[2] + if video_len % self.time_downsample_factor != 0: + x = pad_at_dim(x, (self.time_padding, 0), dim=2, pad_mode=self.init_pad_mode, value=0.0) + return super().forward(x) + + +class DecoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_us=None, + tempo_us=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.norm_type = norm_type + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.spatial_us = list(range(1, self.num_resolutions)) if spatial_us is None else spatial_us + self.tempo_us = [1, 2] if tempo_us is None else tempo_us + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.spatial_us: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): 
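# NOTE (reviewer annotation, not part of the original patch): the temporal up-branch mirrors the
# encoder's down-branch: each resolution level gets num_res_blocks + 1 causal 1D residual blocks
# built with zero_init=True, so they start as (near-)identity mappings and the decoder is
# initially close to frame-wise 2D decoding. Temporal upsampling (TimeUpsampleResCausal2x) is only
# attached at the levels listed in self.tempo_us, which is intended to mirror the encoder's
# self.tempo_ds so the temporal downsampling factor is undone on the way back up.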
+ block.append( + ResnetCausalBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleResCausal2x(block_in, block_in) + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def get_last_layer(self, **kwargs): + try: + return self.conv_out.conv.weight + except: + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, H, W = z.shape + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + + if i_level in self.spatial_us: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal upsample + B, _, T, H, W = h.shape + if i_level in self.tempo_us: + h = self.up_temporal[i_level].upsample(h) + B, _, T, H, W = h.shape + + # end + if self.give_pre_end: + return h + + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + + if self.tanh_out: + h = torch.tanh(h) + + return h + + +class DecoderCausal3DPadding(DecoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.time_padding = self.time_downsample_factor - 1 + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + x = super().forward(x) + return x[:, :, self.time_padding :, :, :] diff --git a/Meissonic/VidTok/vidtok/modules/model_3dcausal_v1_1.py b/Meissonic/VidTok/vidtok/modules/model_3dcausal_v1_1.py new file mode 100644 index 0000000000000000000000000000000000000000..44397a135334a58cf5774fb899152f701d56a37d --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/model_3dcausal_v1_1.py @@ -0,0 +1,959 @@ +from typing import Callable +from beartype import beartype +from beartype.typing import Tuple, Union + +import einops +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .util import checkpoint + + +def spatial_temporal_resblk(x, block_s, block_t, temb): + assert len(x.shape) == 5, "input should be 5D tensor, but got {}D tensor".format(len(x.shape)) + B, C, T, H, W = x.shape + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + x = block_s(x, temb) + x = einops.rearrange(x, "(b t) c h w -> b c t h w", b=B, t=T) + x = einops.rearrange(x, "b c t h w -> (b h w) c t") + x = block_t(x, temb) + x = einops.rearrange(x, "(b h w) c t -> b c t h w", 
b=B, h=H, w=W) + return x + + +def nonlinearity(x): + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32, norm_type="groupnorm"): + if norm_type == "groupnorm": + return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + elif norm_type == "layernorm": + return LayerNorm(num_channels=in_channels, eps=1e-6) + + +def pad_at_dim(t, pad, dim=-1, pad_mode="constant", value=0.0): + assert pad_mode in ["constant", "replicate", "reflect"] + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right + if pad_mode == "constant": + return F.pad(t, (*zeros, *pad), value=value) + return F.pad(t, (*zeros, *pad), mode=pad_mode) + + +def divisible_by(num, den): + return (num % den) == 0 + + +def is_odd(n): + return not divisible_by(n, 2) + + +def cast_tuple(t, length=1): + return t if isinstance(t, tuple) else ((t,) * length) + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class LayerNorm(nn.Module): + def __init__(self, num_channels, eps=1e-6, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = torch.nn.LayerNorm(num_channels, eps=eps, elementwise_affine=True) + + def forward(self, x): + if x.dim() == 5: + x = rearrange(x, "b c t h w -> b t h w c") + x = self.norm(x) + x = rearrange(x, "b t h w c -> b c t h w") + elif x.dim() == 4: + x = rearrange(x, "b c h w -> b h w c") + x = self.norm(x) + x = rearrange(x, "b h w c -> b c h w") + else: + x = rearrange(x, "b c s -> b s c") + x = self.norm(x) + x = rearrange(x, "b s c -> b c s") + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__() + self.in_channels = in_channels + self.norm_type = norm_type + + self.norm = Normalize(in_channels, norm_type=self.norm_type) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c h w -> b 1 (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x, **kwargs): + if self.use_checkpoint: + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x) + + def _forward(self, x, **kwargs): + h_ = x + h_ = self.attention(h_) + h_ = self.proj_out(h_) + return x + h_ + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.proj_out = CausalConv3d(in_channels, in_channels, 
kernel_size=1, stride=1) + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + B = h_.shape[0] + h_ = rearrange(h_, "b c t h w -> (b t) c h w") + h_ = self.norm(h_) + h_ = rearrange(h_, "(b t) c h w -> b c t h w", b=B) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class CausalConv1d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: int, pad_mode="constant", **kwargs): + super().__init__() + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + self.pad_mode = pad_mode + self.time_pad = dilation * (kernel_size - 1) + (1 - stride) + + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + self.is_first_chunk = True + self.causal_cache = None + self.cache_offset = 0 + + def forward(self, x): + if self.is_first_chunk: + first_frame_pad = x[:, :, :1].repeat( + (1, 1, self.time_pad) + ) + else: + first_frame_pad = self.causal_cache + if self.time_pad != 0: + first_frame_pad = first_frame_pad[:, :, -self.time_pad:] + else: + first_frame_pad = first_frame_pad[:, :, 0:0] + + x = torch.concatenate((first_frame_pad, x), dim=2) + + if self.cache_offset == 0: + self.causal_cache = x.clone() + else: + self.causal_cache = x[:,:,:-self.cache_offset].clone() + + return self.conv(x) + + +class CausalConv3d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: Union[int, Tuple[int, int, int]], pad_mode="constant", **kwargs): + super().__init__() + kernel_size = cast_tuple(kernel_size, 3) + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + dilation = cast_tuple(dilation, 3) + stride = cast_tuple(stride, 3) + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + assert is_odd(height_kernel_size) and is_odd(width_kernel_size) + + self.pad_mode = pad_mode + time_pad = dilation[0] * (time_kernel_size - 1) + (1 - stride[0]) + height_pad = dilation[1] * (height_kernel_size - 1) + (1 - stride[1]) + width_pad = dilation[2] * (width_kernel_size - 1) + (1 - stride[2]) + + self.time_pad = time_pad + self.spatial_padding = ( + width_pad // 2, + width_pad - width_pad // 2, + height_pad // 2, + height_pad - height_pad // 2, + 0, + 0, + ) + + self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + self.is_first_chunk = True + self.causal_cache = None + self.cache_offset = 0 + + def forward(self, x): + if self.is_first_chunk: + first_frame_pad = x[:, :, :1, :, :].repeat( + (1, 1, self.time_pad, 1, 1) + ) + else: + first_frame_pad = self.causal_cache + if self.time_pad != 0: + first_frame_pad = first_frame_pad[:, :, -self.time_pad:] + else: + first_frame_pad = first_frame_pad[:, :, 0:0] + + x = torch.concatenate((first_frame_pad, x), dim=2) + + if self.cache_offset == 0: + self.causal_cache = x.clone() + else: + self.causal_cache = x[:,:,:-self.cache_offset].clone() + + x = F.pad(x, self.spatial_padding, mode=self.pad_mode) + return self.conv(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + 
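# NOTE (reviewer annotation, not part of the original patch; it refers to the cached causal
# convolutions defined just above): compared with model_3dcausal.py, the v1.1 CausalConv1d /
# CausalConv3d replace one-shot causal padding with a chunk-wise streaming cache:
#   - on the first chunk (is_first_chunk=True) the input is padded on the past side by repeating
#     its first frame time_pad times (replicate-style causal padding);
#   - on later chunks the last time_pad frames cached from the previous chunk are prepended
#     instead, so the convolution sees a continuous causal history across chunk boundaries;
#   - causal_cache stores the (padded) input of the current chunk, optionally trimmed by
#     cache_offset, for the next chunk; is_first_chunk is presumably reset by the caller whenever
#     a new clip starts.
# This is what lets long videos be encoded/decoded chunk by chunk while approximating the result
# of processing the whole clip at once; apart from these caches and the extra interpolation_mode /
# num_temp_upsample options on TimeUpsampleResCausal2x, this file largely mirrors
# model_3dcausal.py.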
def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = CausalConv3d(in_channels, out_channels, 3, stride=(2, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + self.is_first_chunk = True + self.causal_cache = None + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 1, 0) + + if self.is_first_chunk: + x_pad = torch.nn.functional.pad(x, pad, mode="replicate") + else: + x_pad = torch.concatenate((self.causal_cache, x), dim=2) + + self.causal_cache = x_pad[:,:,-1:].clone() + + x1 = self.avg_pool(x_pad) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + interpolation_mode='nearest', + num_temp_upsample=1 + ): + super().__init__() + self.conv = CausalConv3d(in_channels, out_channels, 3) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + self.interpolation_mode = interpolation_mode + self.num_temp_upsample = num_temp_upsample + self.enable_cached = (self.interpolation_mode == 'trilinear') + self.is_first_chunk = True + self.causal_cache = None + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + if not self.enable_cached: + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + elif not self.is_first_chunk: + x = torch.cat([self.causal_cache, x], dim=2) + self.causal_cache = x[:, :, -2*self.num_temp_upsample:-self.num_temp_upsample].clone() + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + x = x[:, :, 2*self.num_temp_upsample:] + else: + self.causal_cache = x[:, :, -self.num_temp_upsample:].clone() + x, _x = x[:, :, :self.num_temp_upsample], x[:, :, self.num_temp_upsample:] + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + if _x.shape[-3] > 0: + _x = F.interpolate(_x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(_x.dtype) + x = torch.concat([x, _x], dim=2) + + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + 
super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetCausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv3d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + else: + self.nin_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + ) + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm2(h) + h = nonlinearity(h) + h = 
self.dropout(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + return x + h + + +class ResnetCausalBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv1d(out_channels, out_channels, kernel_size=3, stride=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + else: + self.nin_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=1, stride=1) + + if zero_init: + self.conv2.conv.weight.data.zero_() + self.conv2.conv.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class EncoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_ds=None, + tempo_ds=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.norm_type = norm_type + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.is_causal = True + + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.spatial_ds = list(range(0, self.num_resolutions - 1)) if spatial_ds is None else spatial_ds + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] if tempo_ds is None else tempo_ds + self.down = 
nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetCausalBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + down_temporal.attn = attn_temporal + + if i_level in self.spatial_ds: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + down_temporal.downsample = TimeDownsampleResCausal2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls(block_in, norm_type=self.norm_type) + + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + ) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def forward(self, x): + temb = None + B, _, T, H, W = x.shape + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + + if i_level in self.spatial_ds: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal downsample + B, _, T, H, W = htmp.shape + if i_level in self.tempo_ds: + htmp = self.down_temporal[i_level].downsample(htmp) + + hs.append(htmp) + B, _, T, H, W = htmp.shape + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = nonlinearity(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv_out(h) + + return h + + +class EncoderCausal3DPadding(EncoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.init_pad_mode = ignore_kwargs.get("init_pad_mode", "replicate") + + if 
self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + video_len = x.shape[2] + if video_len % self.time_downsample_factor != 0: + time_padding = self.time_downsample_factor - video_len % self.time_downsample_factor + x = pad_at_dim(x, (time_padding, 0), dim=2, pad_mode=self.init_pad_mode, value=0.0) + return super().forward(x) + + +class DecoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_us=None, + tempo_us=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.norm_type = norm_type + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + self.interpolation_mode = ignorekwargs.get("interpolation_mode", 'nearest') + assert self.interpolation_mode in ['nearest', 'trilinear'] + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.spatial_us = list(range(1, self.num_resolutions)) if spatial_us is None else spatial_us + self.tempo_us = [1, 2] if tempo_us is None else tempo_us + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.spatial_us: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + num_temp_upsample = 1 + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetCausalBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleResCausal2x(block_in, 
block_in, interpolation_mode=self.interpolation_mode, num_temp_upsample=num_temp_upsample) + num_temp_upsample *= 2 + + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def get_last_layer(self, **kwargs): + try: + return self.conv_out.conv.weight + except: + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, H, W = z.shape + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + + if i_level in self.spatial_us: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal upsample + B, _, T, H, W = h.shape + if i_level in self.tempo_us: + h = self.up_temporal[i_level].upsample(h) + B, _, T, H, W = h.shape + + # end + if self.give_pre_end: + return h + + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + + if self.tanh_out: + h = torch.tanh(h) + + return h + + +class DecoderCausal3DPadding(DecoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + x = super().forward(x) + return x diff --git a/Meissonic/VidTok/vidtok/modules/model_3dnoncausal.py b/Meissonic/VidTok/vidtok/modules/model_3dnoncausal.py new file mode 100644 index 0000000000000000000000000000000000000000..4223fb635eb88cd7e7292943c131965f2b814206 --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/model_3dnoncausal.py @@ -0,0 +1,652 @@ +from typing import Callable + +import einops +import torch +import torch.nn as nn +from einops import rearrange + +from .model_3dcausal import (AttnBlock, Normalize, nonlinearity, + spatial_temporal_resblk) +from .util import checkpoint + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h 
w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleRes2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = nn.Conv3d(in_channels, out_channels, 3, stride=(2, 1, 1), padding=(0, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x1 = self.avg_pool(x) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleRes2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.conv = nn.Conv3d(in_channels, out_channels, 3, padding=1) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + xlst = [ + torch.nn.functional.interpolate( + sx.unsqueeze(0).to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode="nearest" + ).to(x.dtype) + for sx in x + ] + x = torch.cat(xlst, dim=0) + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if 
self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + if zero_init: + self.conv2.weight.data.zero_() + self.conv2.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetNoncausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = nn.Conv3d(in_channels, out_channels, kernel_size=3, 
stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = nn.Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=1) + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class Encoder3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch=8, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] + self.spatial_ds = list(range(0, self.num_resolutions - 1)) # add for spatial tiling + self.norm_type = norm_type + self.is_causal = False + + # downsampling + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + down_temporal.attn = attn_temporal + + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + 
down_temporal.downsample = TimeDownsampleRes2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn(block_in, norm_type=self.norm_type) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1, + ) + + if self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetNoncausalBlock + + def _make_conv(self) -> Callable: + return nn.Conv3d + + def forward(self, x): + temb = None + B, _, T, _, _ = x.shape + + # downsampling + if x.shape[1] == 4 and self.conv_in.in_channels == 3: + raise ValueError("Mismatched number of input channels") + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + if i_level != self.num_resolutions - 1: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + if i_level in self.tempo_ds: + # temporal downsample + htmp = self.down_temporal[i_level].downsample(htmp) + hs.append(htmp) + B, _, T, _, _ = htmp.shape + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels=8, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + self.tempo_us = [1, 2] + self.norm_type = norm_type + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = 
make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleRes2x(block_in, block_in) + + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1, padding=1) + + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetNoncausalBlock + + def _make_conv(self) -> Callable: + return nn.Conv3d + + def get_last_layer(self, **kwargs): + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, _, _ = z.shape + + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + if i_level != 0: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + if i_level in self.tempo_us: + # temporal upsample + h = self.up_temporal[i_level].upsample(h) + B, _, T, _, _ = h.shape + # end + if self.give_pre_end: + return h + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + if self.tanh_out: + h = torch.tanh(h) + return h diff --git a/Meissonic/VidTok/vidtok/modules/regularizers.py b/Meissonic/VidTok/vidtok/modules/regularizers.py new file mode 100644 index 0000000000000000000000000000000000000000..4f4f1a4fccdb873f640e637ecbaaaef389e75b41 --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/regularizers.py @@ -0,0 +1,268 @@ +from abc import abstractmethod +from functools import cache +from typing import Any, List, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from einops import pack, rearrange, reduce, unpack +from 
torch import Tensor, int32 +from torch.cuda.amp import autocast + +from .distributions import DiagonalGaussianDistribution + + +def exists(v): + return v is not None + + +def default(*args): + for arg in args: + if exists(arg): + return arg + return None + + +def pack_one(t, pattern): + return pack([t], pattern) + + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + + +def round_ste(z: Tensor) -> Tensor: + """Round with straight through gradients.""" + zhat = z.round() + return z + (zhat - z).detach() + + +def log(t, eps=1e-5): + return t.clamp(min=eps).log() + + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + + +def maybe_distributed_mean(t): + if not is_distributed(): + return t + dist.all_reduce(t) + t = t / dist.get_world_size() + return t + + +@cache +def is_distributed(): + return dist.is_initialized() and dist.get_world_size() > 1 + + +class AbstractRegularizer(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]: + raise NotImplementedError() + + @abstractmethod + def get_trainable_parameters(self) -> Any: + raise NotImplementedError() + + +class DiagonalGaussianRegularizer(AbstractRegularizer): + def __init__(self, sample: bool = True): + super().__init__() + self.sample = sample + + def get_trainable_parameters(self) -> Any: + yield from () + + def forward(self, z: torch.Tensor, n_steps=None) -> Tuple[torch.Tensor, dict]: + log = dict() + posterior = DiagonalGaussianDistribution(z) + if self.sample: + z = posterior.sample() + else: + z = posterior.mode() + kl_loss = posterior.kl() + kl_loss = torch.sum(kl_loss) / kl_loss.shape[0] + log["kl_loss"] = kl_loss + return z, log + + +class FSQRegularizer(AbstractRegularizer): + # https://github.com/lucidrains/vector-quantize-pytorch/blob/master/vector_quantize_pytorch/finite_scalar_quantization.py + def __init__( + self, + levels: List[int], + dim: Optional[int] = None, + num_codebooks=1, + keep_num_codebooks_dim: Optional[bool] = None, + scale: Optional[float] = None, + entropy_loss_weight: float = 0.0, + entropy_loss_annealing_steps: int = 0, + entropy_loss_annealing_factor: float = 1.0, + commitment_loss_weight: float = 0.0, + diversity_gamma: float = 1.0, + ): + super().__init__() + _levels = torch.tensor(levels, dtype=int32) + self.register_buffer("_levels", _levels, persistent=False) + + _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32) + self.register_buffer("_basis", _basis, persistent=False) + + self.scale = scale + self.entropy_loss_weight = entropy_loss_weight + self.entropy_loss_annealing_steps = entropy_loss_annealing_steps + self.entropy_loss_annealing_factor = entropy_loss_annealing_factor + self.commitment_loss_weight = commitment_loss_weight + self.diversity_gamma = diversity_gamma + + codebook_dim = len(levels) + self.codebook_dim = codebook_dim + + effective_codebook_dim = codebook_dim * num_codebooks + self.num_codebooks = num_codebooks + self.effective_codebook_dim = effective_codebook_dim + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + self.dim = default(dim, len(_levels) * num_codebooks) + + has_projections = self.dim != effective_codebook_dim + self.project_in = nn.Linear(self.dim, effective_codebook_dim) if has_projections else nn.Identity() + self.project_out = nn.Linear(effective_codebook_dim, self.dim) if has_projections else 
nn.Identity() + self.has_projections = has_projections + + self.codebook_size = self._levels.prod().item() + + implicit_codebook = self.indices_to_codes(torch.arange(self.codebook_size), project_out=False) + self.register_buffer("implicit_codebook", implicit_codebook, persistent=False) + self.register_buffer("zero", torch.tensor(0.0), persistent=False) + + self.global_codebook_usage = torch.zeros([2**self.codebook_dim, self.num_codebooks], dtype=torch.long) + + def get_trainable_parameters(self) -> Any: + return self.parameters() + + def bound(self, z: Tensor, eps: float = 1e-3) -> Tensor: + """Bound `z`, an array of shape (..., d).""" + half_l = (self._levels - 1) * (1 + eps) / 2 + offset = torch.where(self._levels % 2 == 0, 0.5, 0.0) + shift = (offset / half_l).atanh() + return (z + shift).tanh() * half_l - offset + + def quantize(self, z: Tensor) -> Tensor: + """Quantizes z, returns quantized zhat, same shape as z.""" + quantized = round_ste(self.bound(z)) + half_width = self._levels // 2 + return quantized / half_width + + def _scale_and_shift(self, zhat_normalized: Tensor) -> Tensor: + half_width = self._levels // 2 + return (zhat_normalized * half_width) + half_width + + def _scale_and_shift_inverse(self, zhat: Tensor) -> Tensor: + half_width = self._levels // 2 + return (zhat - half_width) / half_width + + def codes_to_indices(self, zhat: Tensor) -> Tensor: + """Converts a `code` to an index in the codebook.""" + assert zhat.shape[-1] == self.codebook_dim + zhat = self._scale_and_shift(zhat) + return (zhat * self._basis).sum(dim=-1).to(int32) + + def indices_to_codes(self, indices: Tensor, project_out=True) -> Tensor: + """Inverse of `codes_to_indices`.""" + + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + + indices = rearrange(indices, "... -> ... 1") + codes_non_centered = (indices // self._basis) % self._levels + codes = self._scale_and_shift_inverse(codes_non_centered) + + if self.keep_num_codebooks_dim: + codes = rearrange(codes, "... c d -> ... (c d)") + + if project_out: + codes = self.project_out(codes) + + if is_img_or_video: + codes = rearrange(codes, "b ... d -> b d ...") + + return codes + + def calculate_entropy_loss_weight(self, n_steps): + if n_steps >= self.entropy_loss_annealing_steps: + return self.entropy_loss_weight + start = self.entropy_loss_annealing_factor * self.entropy_loss_weight + return start - (n_steps / self.entropy_loss_annealing_steps) * (start - self.entropy_loss_weight) + + @autocast(enabled=False) + def forward(self, z: Tensor, inv_temperature: float = 100.0, n_steps: int = 0) -> Tensor: + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension + c - number of codebook dim + """ + is_img_or_video = z.ndim >= 4 + if is_img_or_video: + z = rearrange(z, "b d ... -> b ... d") + z, ps = pack_one(z, "b * d") + + assert z.shape[-1] == self.dim, f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}" + + z = self.project_in(z) + z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks) + + with torch.autocast("cuda", enabled=False): + orig_dtype = z.dtype + z = z.float() + original_input = z + codes = self.quantize(z) + indices = self.codes_to_indices(codes) + + if self.entropy_loss_weight > 0 or self.commitment_loss_weight > 0: + # the same as euclidean distance up to a constant + distance = -2 * torch.einsum("... i d, j d -> ... 
i j", original_input, self.implicit_codebook) + prob = (-distance * inv_temperature).softmax(dim=-1) + per_sample_probs = rearrange(prob, "b n ... -> (b n) ...") + per_sample_entropy = entropy(per_sample_probs).mean() + # distribution over all available tokens in the batch + avg_prob = reduce(per_sample_probs, "... c d -> c d", "mean") + avg_prob = maybe_distributed_mean(avg_prob) + codebook_entropy = entropy(avg_prob).mean() + entropy_aux_loss = per_sample_entropy - self.diversity_gamma * codebook_entropy + # commit loss + commit_loss = F.mse_loss(original_input, codes.detach(), reduction="none") + commit_loss = commit_loss.mean() + else: + entropy_aux_loss = per_sample_entropy = codebook_entropy = commit_loss = self.zero + + codes = codes.type(orig_dtype) + + codes = rearrange(codes, "b n c d -> b n (c d)") + out = self.project_out(codes) + + # reconstitute image or video dimensions + if is_img_or_video: + out = unpack_one(out, ps, "b * d") + out = rearrange(out, "b ... d -> b d ...") + + indices = unpack_one(indices, ps, "b * c") + + if not self.keep_num_codebooks_dim: + indices = rearrange(indices, "... 1 -> ...") + + aux_loss = ( + entropy_aux_loss * self.calculate_entropy_loss_weight(n_steps) + commit_loss * self.commitment_loss_weight + ) + + return out, dict(indices=indices, aux_loss=aux_loss) diff --git a/Meissonic/VidTok/vidtok/modules/util.py b/Meissonic/VidTok/vidtok/modules/util.py new file mode 100644 index 0000000000000000000000000000000000000000..9570016221e47b50d536d30352b5718a29bb0009 --- /dev/null +++ b/Meissonic/VidTok/vidtok/modules/util.py @@ -0,0 +1,324 @@ +import importlib +import random +import os +import einops +import numpy as np +from inspect import isfunction +from rich import print +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from lightning.pytorch.utilities.rank_zero import rank_zero_only + + +def get_valid_dirs(dir1: str, dir2: str, dir3: Union[None, str] = None) -> Union[None, str]: + if (dir1 is not None) and os.path.isdir(dir1): + return dir1 + elif (dir2 is not None) and os.path.isdir(dir2): + return dir2 + elif (dir3 is not None) and os.path.isdir(dir3): + return dir3 + else: + return None + + +def get_valid_paths(path1: str, path2: str, path3: Union[None, str] = None) -> Union[None, str]: + if (path1 is not None) and os.path.isfile(path1): + return path1 + elif (path2 is not None) and os.path.isfile(path2): + return path2 + elif (path3 is not None) and os.path.isfile(path3): + return path3 + else: + return None + + +@rank_zero_only +def print0(*args, **kwargs): + print(*args, **kwargs) + + +def seed_anything(seed: int): + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def isheatmap(x): + if not isinstance(x, torch.Tensor): + return False + + return x.ndim == 2 + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def instantiate_from_config(config): + if not "target" in config: + if config == "__is_first_stage__": + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + return get_obj_from_str(config["target"])(**config.get("params", dict())) + + +def get_obj_from_str(string, reload=False, invalidate_cache=True): + module, cls = string.rsplit(".", 1) + if invalidate_cache: + 
importlib.invalidate_caches() + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def checkpoint(func, inputs, params, flag): + # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(torch.autograd.Function): + # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + ctx.gpu_autocast_kwargs = { + "enabled": torch.is_autocast_enabled(), + "dtype": torch.get_autocast_gpu_dtype(), + "cache_enabled": torch.is_autocast_cache_enabled(), + } + with torch.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] + # Ensure all tensors have requires_grad set to True + ctx.input_params = [p.requires_grad_(True) for p in ctx.input_params] + with torch.enable_grad(), torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. 
+ shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = torch.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads + + +def compute_psnr(x, y): + if x.dim() == 5: + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + assert y.dim() == 5 + y = einops.rearrange(y, "b c t h w -> (b t) c h w") + EPS = 1e-8 + mse = torch.mean((x - y) ** 2, dim=[1, 2, 3]) + psnr = -10 * torch.log10(mse + EPS) + return psnr.mean(dim=0) + + +def compute_ssim(x, y): + if x.dim() == 5: + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + assert y.dim() == 5 + y = einops.rearrange(y, "b c t h w -> (b t) c h w") + kernel_size = 11 + kernel_sigma = 1.5 + k1 = 0.01 + k2 = 0.03 + + f = max(1, round(min(x.size()[-2:]) / 256)) + if f > 1: + x = F.avg_pool2d(x, kernel_size=f) + y = F.avg_pool2d(y, kernel_size=f) + + kernel = gaussian_filter(kernel_size, kernel_sigma, device=x.device, dtype=x.dtype).repeat(x.size(1), 1, 1, 1) + + _compute_ssim_per_channel = _ssim_per_channel_complex if x.dim() == 5 else _ssim_per_channel + ssim_map, cs_map = _compute_ssim_per_channel(x=x, y=y, kernel=kernel, data_range=1, k1=k1, k2=k2) + ssim_val = ssim_map.mean(1) + + return ssim_val.mean(dim=0) + + +def _ssim_per_channel( + x: torch.Tensor, + y: torch.Tensor, + kernel: torch.Tensor, + data_range: Union[float, int] = 1.0, + k1: float = 0.01, + k2: float = 0.03, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. + + Args: + x: An input tensor. Shape :math:`(N, C, H, W)`. + y: A target tensor. Shape :math:`(N, C, H, W)`. + kernel: 2D Gaussian kernel. + data_range: Maximum value range of images (usually 1.0 or 255). + k1: Algorithm parameter, K1 (small constant, see [1]). + k2: Algorithm parameter, K2 (small constant, see [1]). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Full Value of Structural Similarity (SSIM) index. + """ + if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): + raise ValueError( + f"Kernel size can't be greater than actual input size. " + f"Input size: {x.size()}. Kernel size: {kernel.size()}" + ) + + c1 = k1**2 + c2 = k2**2 + n_channels = x.size(1) + mu_x = F.conv2d(x, weight=kernel, stride=1, padding=0, groups=n_channels) + mu_y = F.conv2d(y, weight=kernel, stride=1, padding=0, groups=n_channels) + + mu_xx = mu_x**2 + mu_yy = mu_y**2 + mu_xy = mu_x * mu_y + + sigma_xx = F.conv2d(x**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xx + sigma_yy = F.conv2d(y**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_yy + sigma_xy = F.conv2d(x * y, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xy + + # Contrast sensitivity (CS) with alpha = beta = gamma = 1. + cs = (2.0 * sigma_xy + c2) / (sigma_xx + sigma_yy + c2) + + # Structural similarity (SSIM) + ss = (2.0 * mu_xy + c1) / (mu_xx + mu_yy + c1) * cs + + ssim_val = ss.mean(dim=(-1, -2)) + cs = cs.mean(dim=(-1, -2)) + return ssim_val, cs + + +def _ssim_per_channel_complex( + x: torch.Tensor, + y: torch.Tensor, + kernel: torch.Tensor, + data_range: Union[float, int] = 1.0, + k1: float = 0.01, + k2: float = 0.03, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. 
+ + Args: + x: An input tensor. Shape :math:`(N, C, H, W, 2)`. + y: A target tensor. Shape :math:`(N, C, H, W, 2)`. + kernel: 2-D gauss kernel. + data_range: Maximum value range of images (usually 1.0 or 255). + k1: Algorithm parameter, K1 (small constant, see [1]). + k2: Algorithm parameter, K2 (small constant, see [1]). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Full Value of Complex Structural Similarity (SSIM) index. + """ + n_channels = x.size(1) + if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): + raise ValueError( + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " + f"Kernel size: {kernel.size()}" + ) + + c1 = k1**2 + c2 = k2**2 + + x_real = x[..., 0] + x_imag = x[..., 1] + y_real = y[..., 0] + y_imag = y[..., 1] + + mu1_real = F.conv2d(x_real, weight=kernel, stride=1, padding=0, groups=n_channels) + mu1_imag = F.conv2d(x_imag, weight=kernel, stride=1, padding=0, groups=n_channels) + mu2_real = F.conv2d(y_real, weight=kernel, stride=1, padding=0, groups=n_channels) + mu2_imag = F.conv2d(y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) + + mu1_sq = mu1_real.pow(2) + mu1_imag.pow(2) + mu2_sq = mu2_real.pow(2) + mu2_imag.pow(2) + mu1_mu2_real = mu1_real * mu2_real - mu1_imag * mu2_imag + mu1_mu2_imag = mu1_real * mu2_imag + mu1_imag * mu2_real + + compensation = 1.0 + + x_sq = x_real.pow(2) + x_imag.pow(2) + y_sq = y_real.pow(2) + y_imag.pow(2) + x_y_real = x_real * y_real - x_imag * y_imag + x_y_imag = x_real * y_imag + x_imag * y_real + + sigma1_sq = F.conv2d(x_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_sq + sigma2_sq = F.conv2d(y_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu2_sq + sigma12_real = F.conv2d(x_y_real, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_real + sigma12_imag = F.conv2d(x_y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_imag + sigma12 = torch.stack((sigma12_imag, sigma12_real), dim=-1) + mu1_mu2 = torch.stack((mu1_mu2_real, mu1_mu2_imag), dim=-1) + # Set alpha = beta = gamma = 1. 
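+ # Descriptive note: cs_map below is the contrast/structure term (2*sigma12 + C2) / (sigma1^2 + sigma2^2 + C2), + # and ssim_map multiplies in the luminance term (2*mu1*mu2 + C1) / (mu1^2 + mu2^2 + C1) of the standard + # SSIM formula, both evaluated on the complex-valued local statistics computed above.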
+ cs_map = (sigma12 * 2 + c2 * compensation) / (sigma1_sq.unsqueeze(-1) + sigma2_sq.unsqueeze(-1) + c2 * compensation) + ssim_map = (mu1_mu2 * 2 + c1 * compensation) / (mu1_sq.unsqueeze(-1) + mu2_sq.unsqueeze(-1) + c1 * compensation) + ssim_map = ssim_map * cs_map + + ssim_val = ssim_map.mean(dim=(-2, -3)) + cs = cs_map.mean(dim=(-2, -3)) + + return ssim_val, cs + + +def gaussian_filter( + kernel_size: int, sigma: float, device: Optional[str] = None, dtype: Optional[type] = None +) -> torch.Tensor: + r"""Returns 2D Gaussian kernel N(0,`sigma`^2) + Args: + kernel_size: Size of the kernel + sigma: Std of the distribution + device: target device for kernel generation + dtype: target data type for kernel generation + Returns: + gaussian_kernel: Tensor with shape (1, kernel_size, kernel_size) + """ + coords = torch.arange(kernel_size, dtype=dtype, device=device) + coords -= (kernel_size - 1) / 2.0 + + g = coords**2 + g = (-(g.unsqueeze(0) + g.unsqueeze(1)) / (2 * sigma**2)).exp() + + g /= g.sum() + return g.unsqueeze(0) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/.gitignore b/Meissonic/VidTok/vidtok_cache/VidTok/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..81849eac5c147c73d4202463658bd4b49571d163 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/.gitignore @@ -0,0 +1,167 @@ +amlt +.amltconfig +checkpoints +logs +wandb +tmp + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/CODE_OF_CONDUCT.md b/Meissonic/VidTok/vidtok_cache/VidTok/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..f9ba8cf65f3e3104dd061c178066ec8247811f33 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/CODE_OF_CONDUCT.md @@ -0,0 +1,9 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/LICENSE b/Meissonic/VidTok/vidtok_cache/VidTok/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9e841e7a26e4eb057b24511e7b92d42b257a80e5 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/README.md b/Meissonic/VidTok/vidtok_cache/VidTok/README.md new file mode 100644 index 0000000000000000000000000000000000000000..edd9df5e2319e6dab35604d0d67192381dfacabf --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/README.md @@ -0,0 +1,472 @@ +
+ +# VidTok: A Family of Versatile and State-Of-The-Art Video Tokenizers + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2412.13061)   [![GitHub](https://img.shields.io/badge/GitHub-Code-blue?logo=github&logoColor=white)](https://github.com/microsoft/VidTok)   [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow)](https://huggingface.co/microsoft/VidTok) +
+ +--- + +![radar](assets/radar.png) + + We introduce VidTok, a cutting-edge family of video tokenizers that excels in both continuous and discrete tokenizations. VidTok incorporates several key advancements over existing approaches: + * ⚡️ **Efficient Architecture**. Separate spatial and temporal sampling reduces computational complexity without sacrificing quality. + * 🔥 **Advanced Quantization**. Finite Scalar Quantization (FSQ) addresses training instability and codebook collapse in discrete tokenization. + * 💥 **Enhanced Training**. A two-stage strategy—pre-training on low-res videos and fine-tuning on high-res—boosts efficiency. Reduced frame rates improve motion dynamics representation. + +VidTok, trained on a large-scale video dataset, outperforms previous models across all metrics, including PSNR, SSIM, LPIPS, and FVD. + +https://github.com/user-attachments/assets/a3341037-130d-4a83-aba6-c3daeaf66932 + +## 🔥 News +- August, 2025: 🚀 Introduced spatial tiling for large resolutions (>256), reducing GPU memory usage to ~6 GB when encoding and decoding a 17 × 768 × 768 video. +* March, 2025: 🚀 [VidTwin](https://github.com/microsoft/VidTok/tree/main/vidtwin) has been accepted by CVPR 2025, and the [checkpoint](https://huggingface.co/microsoft/vidtwin) was released! +* March, 2025: 🚀 [VidTok v1.1](#-updates-in-vidtok-v11) was released! We fine-tuned all causal models on long videos to support tokenization and reconstruction of videos of arbitrary length with fine temporal smoothness. [Relevant checkpoints](https://huggingface.co/microsoft/VidTok/tree/main/checkpoints/vidtok_v1_1) are continuously updating. +* December, 2024: 🚀 [VidTwin](https://github.com/microsoft/VidTok/tree/main/vidtwin) was released! +* December, 2024: 🚀 [VidTok](https://github.com/microsoft/vidtok) was released! + + +## 💥 Updates in VidTok v1.1 +> VidTok v1.1 is an update for causal models. We fine-tuned all causal models on long videos to support tokenization and reconstruction of videos of arbitrary length with fine temporal smoothness. See performance [here](#v11-performance). + +### v1.1: Long Video Reconstruction +Run the following inference script to [reconstruct an input video](#reconstruct-an-input-video): +```bash +python scripts/inference_reconstruct.py --config CONFIG_v1_1 --ckpt CKPT_v1_1 --input_video_path VIDEO_PATH --input_height 256 --input_width 256 --sample_fps 30 --chunk_size CHUNK_SIZE --output_video_dir OUTPUT_DIR --read_long_video +# Set `CHUNK_SIZE` according to your GPU memory, recommendly 16. +``` +and run the following inference script to [evaluate the reconstruction performance](#performance-evaluation): +```bash +python scripts/inference_evaluate.py --config CONFIG_v1_1 --ckpt CKPT_v1_1 --data_dir DATA_DIR --input_height 256 --input_width 256 --sample_fps 30 --chunk_size CHUNK_SIZE --read_long_video +# Set `CHUNK_SIZE` according to your GPU memory, recommendly 16. +``` + +For an easy usage of VidTok v1.1 models, refer to [this script](#easy-usage) and make the following revision: +```python +# Use VidTok v1.1 models +cfg_path = "configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml" +ckpt_path = "checkpoints/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.ckpt" + +... + +model.to('cuda').eval() +# Using tiling inference to save memory usage +model.use_tiling = True +model.t_chunk_enc = 16 +model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor +model.use_overlap = True +# random input: long video +x_input = (torch.rand(1, 3, 129, 256, 256) * 2 - 1).to('cuda') + +... 
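+# Note (added, hedged): the elided lines run the tokenizer's encode/decode as in the referenced +# easy-usage script to produce `x_recon` from `x_input` (the exact calls are whatever that script uses). +# Because the causal encoder pads the time axis up to a multiple of its temporal downsample factor, +# `x_recon` may come back with a few extra leading frames; the check below trims them so the +# reconstruction matches the 129-frame input.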
+ +if x_recon.shape[2] != x_input.shape[2]: + x_recon = x_recon[:, :, -x_input.shape[2]:, ...] +``` + +### v1.1: Long Video Fine-tuning +Follow this [training guidance](#fine-tune-on-custom-data) to fine-tune on your custom long video data and note that: +- Compared to VidTok v1.0, we tend to use longer sequences to fine-tune the model (for example, setting `NUM_FRAMES_1` to 33, 49, or larger). +- The resolution and the sequence length of training data should be adjusted according to GPU memory. + +### v1.1: Performance +| Model | Regularizer | Causal | VCR | PSNR | SSIM | LPIPS | FVD | +|------|------|------|------|------|------|------|------| +| [vidtok_kl_causal_488_16chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.ckpt) | KL-16chn | ✔️ | 4x8x8 | 35.13 | 0.941 | 0.049 | 87.4 | +| [vidtok_kl_causal_41616_16chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.ckpt) | KL-16chn | ✔️ | 4x16x16 | 29.61 | 0.854 | 0.113 | 162.7 | +| [vidtok_kl_causal_288_8chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.ckpt) | KL-8chn | ✔️ | 2x8x8 | 34.59 | 0.935 | 0.051 | 78.2 | +| [vidtok_fsq_causal_488_32768_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.ckpt) | FSQ-32,768 | ✔️ | 4x8x8 | 29.39 | 0.856 | 0.114 | 168.5 | +| [vidtok_fsq_causal_888_32768_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.ckpt) | FSQ-32,768 | ✔️ | 8x8x8 | 27.95 | 0.817 | 0.142 | 293.2 | +- This is the evaluation result of long video reconstruction conducted on each complete video in [MCL_JCL](https://mcl.usc.edu/mcl-jcv-dataset/) dataset, with a sample fps of 30 and a resolution of `256x256`. + + +## 🔧 Setup +1. Clone this repository and navigate to VidTok folder: +```bash +git clone https://github.com/microsoft/VidTok +cd VidTok +``` +2. We provide an `environment.yaml` file for setting up a Conda environment. Conda's installation instructions are available [here](https://docs.anaconda.com/miniconda/index.html). +```bash +# 1. Prepare conda environment +conda env create -f environment.yaml +# 2. Activate the environment +conda activate vidtok +``` + +We recommend using 1+ high-end GPU for training and inference. We have done all testing and development using A100 and MI300X GPUs. For convenience, we also provide prebuilt [Docker](https://hub.docker.com/) images with required dependencies. You can use it as follows: + +```bash +# NVIDIA GPUs +docker run -it --gpus all --shm-size 256G --rm -v `pwd`:/workspace --workdir /workspace \ + deeptimhe/ubuntu22.04-cuda12.1-python3.10-pytorch2.5:orig-vidtok bash +# AMD GPUs +docker run -it --gpus all --shm-size 256G --rm -v `pwd`:/workspace --workdir /workspace \ + deeptimhe/ubuntu22.04-rocm6.2.4-python3.10-pytorch2.5:orig-vidtok bash +``` + +## 🎈 Checkpoints +Download pre-trained models [here](https://huggingface.co/microsoft/VidTok/tree/main/checkpoints), and put them in `checkpoints` folder, like: +``` +└── checkpoints + ├── vidtok_v1_1 + │ ├── vidtok_kl_causal_488_16chn_v1_1.ckpt + │ └── ... 
+ ├── vidtok_fsq_causal_41616_262144.ckpt + ├── vidtok_fsq_causal_488_262144.ckpt + ├── vidtok_fsq_causal_488_32768.ckpt + ├── vidtok_fsq_causal_488_4096.ckpt + ├── vidtok_fsq_noncausal_41616_262144.ckpt + ├── vidtok_fsq_noncausal_488_262144.ckpt + ├── vidtok_kl_causal_288_8chn.ckpt + ├── vidtok_kl_causal_41616_4chn.ckpt + ├── vidtok_kl_causal_444_4chn.ckpt + ├── vidtok_kl_causal_488_16chn.ckpt + ├── vidtok_kl_causal_488_4chn.ckpt + ├── vidtok_kl_causal_488_8chn.ckpt + ├── vidtok_kl_noncausal_41616_16chn.ckpt + ├── vidtok_kl_noncausal_41616_4chn.ckpt + ├── vidtok_kl_noncausal_488_16chn.ckpt + └── vidtok_kl_noncausal_488_4chn.ckpt +``` +Each checkpoint has a corresponding config file with the same name in `configs` folder. + + +## 🔆 Performance + +| Model | Regularizer | Causal | VCR | PSNR | SSIM | LPIPS | FVD | +|------|------|------|------|------|------|------|------| +| [vidtok_kl_causal_488_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_4chn.ckpt) | KL-4chn | ✔️ | 4x8x8 | 29.64 | 0.852| 0.114| 194.2| +| [vidtok_kl_causal_488_8chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_8chn.ckpt) | KL-8chn | ✔️ |4x8x8 | 31.83 | 0.897| 0.083| 109.3| +| [vidtok_kl_causal_488_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_16chn.ckpt) | KL-16chn | ✔️ | 4x8x8 | 35.04 |0.942 |0.047 | 78.9| +| [vidtok_kl_causal_288_8chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_288_8chn.ckpt) | KL-8chn | ✔️ | 2x8x8 | 33.86 | 0.928 |0.057 | 80.7 | +| [vidtok_kl_causal_444_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_444_4chn.ckpt) | KL-4chn | ✔️ | 4x4x4 | 34.78 | 0.941 | 0.051| 87.2| +| [vidtok_kl_causal_41616_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_41616_4chn.ckpt) | KL-4chn | ✔️ | 4x16x16 | 25.05 | 0.711| 0.228| 549.1| +| [vidtok_kl_noncausal_488_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_488_4chn.ckpt) | KL-4chn | ✖️ | 4x8x8 | 30.60 | 0.876 | 0.098| 157.9| +| [vidtok_kl_noncausal_488_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_488_16chn.ckpt) | KL-16chn | ✖️ | 4x8x8 | 36.13 | 0.950 | 0.044| 60.5| +| [vidtok_kl_noncausal_41616_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_41616_4chn.ckpt) | KL-4chn | ✖️ | 4x16x16 | 26.06 | 0.751 | 0.190|423.2 | +| [vidtok_kl_noncausal_41616_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_41616_16chn.ckpt) | KL-16chn | ✖️ | 4x16x16 | 30.69 | 0.878 | 0.095| 147.1| +| [vidtok_fsq_causal_488_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_262144.ckpt) | FSQ-262,144 | ✔️ | 4x8x8 | 29.82 | 0.867 |0.106 | 160.1| +| [vidtok_fsq_causal_488_32768](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_32768.ckpt) | FSQ-32,768 | ✔️ | 4x8x8 | 29.16 | 0.854 | 0.117| 196.9| +| [vidtok_fsq_causal_488_4096](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_4096.ckpt) | FSQ-4096 | ✔️ | 4x8x8 | 28.36 | 0.832 | 0.133| 218.1| +| [vidtok_fsq_causal_41616_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_41616_262144.ckpt) | FSQ-262,144 | ✔️ | 4x16x16 | 25.38 | 0.738 |0.206 | 430.1| +| 
[vidtok_fsq_noncausal_488_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_noncausal_488_262144.ckpt) | FSQ-262,144 | ✖️ | 4x8x8 | 30.78 | 0.889 | 0.091 | 132.1 |
+| [vidtok_fsq_noncausal_41616_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_noncausal_41616_262144.ckpt) | FSQ-262,144 | ✖️ | 4x16x16 | 26.37 | 0.772 | 0.171 | 357.0 |
+
+- `VCR` indicates the video compression ratio, `TxHxW`.
+- The above table shows model performance evaluated on 30 test videos in the [MCL_JCV](https://mcl.usc.edu/mcl-jcv-dataset/) dataset, with a sample fps of 30. The input size is `17x256x256` for causal models and `16x256x256` for non-causal models.
+
+## 🔛 Training
+
+### Data Preparation
+1. Put all training videos under `DATA_DIR`:
+```
+└── DATA_DIR
+    ├── subset1
+    │   ├── videoname11.mp4
+    │   └── videoname12.mp4
+    ├── subset2
+    │   ├── videoname21.mp4
+    │   ├── videoname22.mp4
+    │   └── subsubset1
+    │       ├── videoname211.mp4
+    │       └── videoname212.mp4
+    └── ...
+```
+2. Prepare a `.csv` meta file to record the relative paths of these videos with respect to `DATA_DIR`, like the example below (a helper sketch for generating this file automatically is included after the training command below):
+```
+videos
+subset1/videoname11.mp4
+subset2/videoname21.mp4
+subset2/subsubset1/videoname211.mp4
+```
+
+> Validation data is also prepared following the above steps.
+
+### Fine-tune on Custom Data
+1. Prepare your own training and validation data following [Data Preparation](#data-preparation).
+2. Select the appropriate `CONFIG` file from the `configs` folder based on your needs, and modify the following parameters:
+    - Specify the `ckpt_path` parameter to initialize the model with pre-trained checkpoint parameters:
+      ```yaml
+      model:
+        params:
+          ckpt_path: PATH_TO_CHECKPOINT  # train from existing checkpoint
+      ```
+    - Specify the `data` section to use your own training and validation data:
+      ```yaml
+      train:
+        target: vidtok.data.vidtok.VidTokDataset
+        params:
+          data_dir: DATA_DIR_1  # DATA_DIR for training data
+          meta_path: META_PATH_1  # path to the .csv meta file of training data
+          video_params:
+            input_height: INPUT_HEIGHT_1
+            input_width: INPUT_WIDTH_1
+            sample_num_frames: NUM_FRAMES_1  # typically set to 17 for causal models and 16 for non-causal models
+            sample_fps: SAMPLE_FPS_1  # sample fps for training data
+      validation:
+        target: vidtok.data.vidtok.VidTokDataset
+        params:
+          data_dir: DATA_DIR_2  # DATA_DIR for validation data
+          meta_path: META_PATH_2  # path to the .csv meta file of validation data
+          video_params:
+            input_height: INPUT_HEIGHT_2
+            input_width: INPUT_WIDTH_2
+            sample_num_frames: NUM_FRAMES_2  # typically set to 17 for causal models and 16 for non-causal models
+            sample_fps: SAMPLE_FPS_2  # sample fps for validation data
+            start_index: 0  # fixed value to ensure the same sampled data
+      ```
+    - Set `fix_encoder` and `fix_decoder` to `false` to enable full-model fine-tuning:
+      ```yaml
+      model:
+        params:
+          encoder_config:
+            params:
+              fix_encoder: false
+              fix_decoder: false
+      ```
+    - Adjust other hyperparameters according to your needs.
+
+3. Run the following command to start training:
+```bash
+python main.py -b CONFIG --logdir LOGDIR
+
+# You can also use `torchrun` to start the training code.
+```
+Training logs and checkpoints are saved in `LOGDIR`.
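+
+The `.csv` meta files referenced in [Data Preparation](#data-preparation) are plain single-column listings, so they can be generated automatically. The helper below is a hypothetical sketch (it is not part of the VidTok codebase; the function name `write_meta_csv` and its arguments are illustrative) that walks `DATA_DIR` and writes one row per `.mp4` file in the format shown above:
+
+```python
+import csv
+from pathlib import Path
+
+def write_meta_csv(data_dir: str, out_path: str) -> None:
+    """Hypothetical helper (not part of this repo): list every .mp4 under
+    data_dir, as paths relative to data_dir, in a single-column .csv meta file."""
+    root = Path(data_dir)
+    rows = sorted(p.relative_to(root).as_posix() for p in root.rglob("*.mp4"))
+    with open(out_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["videos"])  # header row used by the meta format shown above
+        for rel in rows:
+            writer.writerow([rel])
+
+# e.g. write_meta_csv("DATA_DIR", "train_meta.csv"), then pass train_meta.csv as META_PATH_1
+```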
+
+It is recommended to use [Weights & Biases](https://wandb.ai/site) as the data visualization tool ([TensorBoard](https://www.tensorflow.org/tensorboard) is used by default). Run `wandb login` to log in first, and then run:
+```bash
+python main.py -b CONFIG --logdir LOGDIR --wandb --wandb_entity ENTITY --wandb_project PROJECT
+```
+
+
+### Train from Scratch
+
+Two-stage Training +We adopt a two-stage training strategy to improve training efficiency: initially pre-training the full model on low-resolution videos, followed by fine-tuning only the decoder on high-resolution videos. + +| First Stage | Second Stage | Fix encoder | PSNR | SSIM | LPIPS | GPU Hours| +|------|------|------|------|------|------|------| +| 256 x 256 | - | - | 29.19 | 0.843 | 0.127| 3,072| +| 128 x 128 | 256 x 256 | ✔️ | 29.21 | 0.843 | 0.125| 1,536| + +1. Prepare your own training and validation data following [Data Preparation](#data-preparation). +2. Select the appropriate `CONFIG` file from `configs` folder based on your needs, and specify the `data` section to use your own training and validation data: + ```yaml + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 # vary in different training stages + input_width: INPUT_WIDTH_1 # vary in different training stages + sample_num_frames: NUM_FRAMES_1 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_1 # sample fps for training data + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: NUM_FRAMES_2 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_2 # sample fps for validation data + start_index: 0 # fixed value to ensure the same sampled data + ``` + +3. Start the first stage of training. First, revise the `CONFIG` file to enable full model training with low-resolution data: +```yaml +model: + params: + # ckpt_path: # disable this parameter so as to train from scratch + encoder_config: + params: + fix_encoder: false + fix_decoder: false +data: + params: + train: + params: + video_params: + input_height: 128 + input_width: 128 +``` +Then revise other hyperparameters according to your needs, and run the training command to start training as in [Fine-tune on Custom Data](#fine-tune-on-custom-data). We train VidTok for 50,000 steps with batch size 16 in this stage. + +4. Start the second stage of training. First, revise the `CONFIG` file to enable the fine-tuning of the decoder with high-resolution data: +```yaml +model: + params: + ckpt_path: CKPT_PATH # path to the saved checkpoint after the first stage of training + encoder_config: + params: + fix_encoder: true + fix_decoder: false +data: + params: + train: + params: + video_params: + input_height: 256 + input_width: 256 +``` +Then revise other hyperparameters according to your needs, and run the training command to start training as in [Fine-tune on Custom Data](#fine-tune-on-custom-data). We train VidTok for 30,000 steps with batch size 8 in this stage. +
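+
+The `fix_encoder` / `fix_decoder` switches in the second stage correspond to ordinary parameter freezing: the encoder trained in the first stage is kept fixed, and only the decoder receives gradients on high-resolution clips. The snippet below is only an illustrative sketch of that idea (it reuses the `load_model_from_config` helper from the Inference section below and assumes the model exposes `encoder`/`decoder` submodules, as the torch-compile example there does); in actual training these flags are read from the YAML config rather than set in code.
+
+```python
+from scripts.inference_evaluate import load_model_from_config
+
+# Illustration only: what the stage-two configuration amounts to.
+model = load_model_from_config("configs/vidtok_kl_causal_488_16chn.yaml",
+                               "checkpoints/vidtok_kl_causal_488_16chn.ckpt")
+model.encoder.requires_grad_(False)  # fix_encoder: true  -> frozen after stage one
+model.decoder.requires_grad_(True)   # fix_decoder: false -> fine-tuned on 256x256 clips
+
+num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+print(f"trainable parameters: {num_trainable / 1e6:.1f}M")
+```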
+
+
+## 🚀 Inference
+
+### Easy Usage
+We provide the following example for quick usage of our models. It works for both continuous and discrete tokenization, and for both causal and non-causal models.
+Just provide the path to the configuration file `cfg_path` and the checkpoint file `ckpt_path`.
+```python
+import torch
+from scripts.inference_evaluate import load_model_from_config
+
+cfg_path = "configs/vidtok_kl_causal_488_4chn.yaml"
+ckpt_path = "checkpoints/vidtok_kl_causal_488_4chn.ckpt"
+
+# load pre-trained model
+model = load_model_from_config(cfg_path, ckpt_path)
+model.to('cuda').eval()
+# random input
+num_frames = 17 if model.is_causal else 16
+x_input = (torch.rand(1, 3, num_frames, 256, 256) * 2 - 1).to('cuda')  # [B,C,T,H,W], range -1~1
+# model forward
+with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16):
+    _, x_recon, _ = model(x_input)
+assert x_input.shape == x_recon.shape
+```
+If you want to infer directly from latent tokens, run the following code:
+```python
+z, reg_log = model.encode(x_input, return_reg_log=True)
+# infer from the continuous latent space
+x_recon = model.decode(z)
+# infer from discrete latent tokens
+x_recon = model.decode(reg_log['indices'], decode_from_indices=True)
+```
+
+### Use Torch Compile to Speed Up Inference
+Using compiled components in VidTok can speed up inference by as much as 2x. The following code snippet demonstrates how to compile our models.
+
+```python
+import time
+import torch
+from scripts.inference_evaluate import load_model_from_config
+
+torch._inductor.config.cpp.weight_prepack = True
+torch._inductor.config.freezing = True
+
+cfg_path = "configs/vidtok_kl_causal_488_4chn.yaml"
+ckpt_path = "checkpoints/vidtok_kl_causal_488_4chn.ckpt"
+
+# load pre-trained model
+model = load_model_from_config(cfg_path, ckpt_path)
+model.to('cuda').eval()
+model.encoder = torch.compile(model.encoder)
+model.decoder = torch.compile(model.decoder)
+
+# random input
+num_frames = 17 if model.is_causal else 16
+x_input = (torch.rand(1, 3, num_frames, 256, 256) * 2 - 1).to('cuda')  # [B,C,T,H,W], range -1~1
+
+# warm up
+with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16):
+    _, x_recon, _ = model(x_input)
+
+torch.cuda.synchronize()
+start = time.time()
+with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16):
+    for i in range(10):
+        _, x_recon, _ = model(x_input)
+torch.cuda.synchronize()
+print(f"Average inference time: {(time.time() - start) / 10:.4f} seconds")
+```
+
+### Reconstruct an Input Video
+```bash
+python scripts/inference_reconstruct.py --config CONFIG --ckpt CKPT --input_video_path VIDEO_PATH --input_height 256 --input_width 256 --sample_fps 30 --output_video_dir OUTPUT_DIR
+```
+- Set `VIDEO_PATH` to the path of your test video. We provide an example video in `assets/example.mp4`.
+- The reconstructed video is saved in `OUTPUT_DIR`.
+- For causal models, you can choose to add `--pad_gen_frames` to the command line, which may improve the smoothness of the reconstructed video.
+
+### Performance Evaluation
+We also provide a script, `scripts/inference_evaluate.py`, to evaluate video reconstruction performance in PSNR, SSIM and LPIPS.
+
+1. Put all of your test videos under `DATA_DIR`.
+2. Run the following command, and all `.mp4` videos under `DATA_DIR` will be tested:
+```bash
+python scripts/inference_evaluate.py --config CONFIG --ckpt CKPT --data_dir DATA_DIR --input_height 256 --input_width 256 --sample_fps 30
+```
+(Optional) If you only want to test certain videos under `DATA_DIR`, prepare a `.csv` meta file to indicate the video files to be tested (refer to [Data Preparation](#data-preparation)), and add `--meta_path META_PATH` to the above command to specify its path.
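+
+For a quick intuition of what the evaluation reports, PSNR can be sanity-checked directly on a reconstruction from the Easy Usage snippet. The helper below is only a rough sketch, not the evaluation protocol of `scripts/inference_evaluate.py` (which also computes SSIM and LPIPS); it assumes inputs in the `[-1, 1]` range used above.
+
+```python
+import torch
+
+def psnr(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    """Rough peak signal-to-noise ratio for tensors in [-1, 1] (sketch, not the repo script)."""
+    x01, y01 = (x + 1) / 2, (y + 1) / 2                 # rescale to [0, 1]
+    mse = torch.mean((x01 - y01) ** 2).clamp_min(1e-10)  # avoid log(0) on identical inputs
+    return 10 * torch.log10(1.0 / mse)
+
+# e.g. after `_, x_recon, _ = model(x_input)` from the Easy Usage snippet:
+# print(f"PSNR: {psnr(x_input.float(), x_recon.float()).item():.2f} dB")
+```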
+
+## 💡 Intended Uses
+
+We are sharing our model with the research community to foster further research in this area:
+* Training your own video tokenizers for research purposes.
+* Video tokenization with various compression rates.
+
+
+## 🪧 Out-of-scope Uses
+
+Our models are not specifically designed or evaluated for all downstream purposes. Developers should consider common limitations of video tokenizers (e.g., performance degradation on out-of-domain data) as they select use cases, and evaluate and mitigate for privacy, safety, and fairness before using them within a specific downstream use case, particularly for high-risk scenarios.
+
+Developers should be aware of and adhere to applicable laws or regulations (including privacy, trade compliance laws, etc.) that are relevant to their use case.
+
+
+## 🤖️ Risks and Limitations
+
+Some of the limitations of this model to be aware of include:
+* VidTok may lose detailed information on the reconstructed content.
+* VidTok inherits any biases, errors, or omissions characteristic of its training data.
+* VidTok was developed for research and experimental purposes. Further testing and validation are needed before considering its application in commercial or real-world scenarios.
+
+
+## 🤗 Acknowledgments
+
+This codebase borrows code from [generative-models](https://github.com/Stability-AI/generative-models). We thank Stability AI for its efforts and innovations, which have made the development process more efficient and convenient.
+
+Thank you to everyone who contributed their wisdom and efforts to this project.
+
+## ✏️ BibTeX
+
+```bibtex
+@article{tang2024vidtok,
+  title={VidTok: A Versatile and Open-Source Video Tokenizer},
+  author={Tang, Anni and He, Tianyu and Guo, Junliang and Cheng, Xinle and Song, Li and Bian, Jiang},
+  year={2024},
+  journal={arXiv preprint arXiv:2412.13061},
+}
+```
+
+## ☎️ Contact
+
+We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at tianyuhe@microsoft.com.
+
+## 📄 Contributing
+
+This project welcomes contributions and suggestions. Most contributions require you to agree to a
+Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
+the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
+
+When you submit a pull request, a CLA bot will automatically determine whether you need to provide
+a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
+provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + + +## 📍 Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/SECURITY.md b/Meissonic/VidTok/vidtok_cache/VidTok/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..b3c89efc852e22f71eabf5dfbc6ac62493425eb6 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. 
+ +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). + + diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/SUPPORT.md b/Meissonic/VidTok/vidtok_cache/VidTok/SUPPORT.md new file mode 100644 index 0000000000000000000000000000000000000000..291d4d43733f4c15a81ff598ec1c99fd6c18f64c --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/SUPPORT.md @@ -0,0 +1,25 @@ +# TODO: The maintainer of this repo has not yet edited this file + +**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? + +- **No CSS support:** Fill out this template with information about how to file issues and get help. +- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. +- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. + +*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* + +# Support + +## How to file issues and get help + +This project uses GitHub Issues to track bugs and feature requests. Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or +feature request as a new Issue. + +For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE +FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER +CHANNEL. WHERE WILL YOU HELP PEOPLE?**. + +## Microsoft Support Policy + +Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/assets/example.mp4 b/Meissonic/VidTok/vidtok_cache/VidTok/assets/example.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..21be26c8a8f5dbb76de5a225e5b284fdbb024904 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/assets/example.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:588ca89fae7320a079d4f77cf963f88075959f06310594ab35d3e04b844c4d50 +size 540937 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/assets/gemini.png b/Meissonic/VidTok/vidtok_cache/VidTok/assets/gemini.png new file mode 100644 index 0000000000000000000000000000000000000000..fe326dc4b4b4b6db82575b15e9b4bf2f3b63e63d Binary files /dev/null and b/Meissonic/VidTok/vidtok_cache/VidTok/assets/gemini.png differ diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/assets/radar.png b/Meissonic/VidTok/vidtok_cache/VidTok/assets/radar.png new file mode 100644 index 0000000000000000000000000000000000000000..6eff9338928faf73c0e39e859237bc6c588ec183 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/assets/radar.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cfdef783e26264ff671f81845b54471b237f5cd7df0dbd63642fde0c20f935e +size 424581 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin.png b/Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin.png new file mode 100644 index 0000000000000000000000000000000000000000..a7ae41105384d077a7d128d1d687e3ee8e17fbd8 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e58210f7ca7784a4df737f8a2ece2b5e9f85fff3bf423b24aa6aeeb0b196cef +size 549933 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin_demo.png b/Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin_demo.png new file mode 100644 index 
0000000000000000000000000000000000000000..ca79126104a7cbfb67c0acb70f03c6e66546675c --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin_demo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8e15c46050cc7957bc5d334bd1902d58b9be9aa7669df82f2ae9ba08c90585 +size 6371846 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80b2edbebf97fd747933000d32ccbdfe0e135702 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_41616_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + 
num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_262144.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26f9c3e94ed8b64a681b2ed3887929109b6ccc53 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_262144.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git 
a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_32768.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_32768.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e661c533b367c7be4eea8de629e074b395e1684 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_32768.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_32768.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_4096.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_4096.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..9bf654bb21c1bd67ddde3f4878497c6ad780503a --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_4096.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_4096.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8] # codebook size: 8*8*8*8=4096 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e400ff097155465e2a477bfbc3ca32f346a0ca12 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml @@ -0,0 
+1,117 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_noncausal_41616_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c8731af435aae0efbde9716ee696ce3efd30d25 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml @@ -0,0 +1,117 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_noncausal_488_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + 
target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_288_8chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_288_8chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c20bf1aaf2aa97f67d19d0de1dbac47cf2d55ac --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_288_8chn.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_288_8chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [1] + tempo_us: [2] + time_downsample_factor: 2 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: 
layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1db41cd612f5e6b1390bebda33e559e347b60907 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_41616_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + 
loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_444_4chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_444_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bcbbd11cba27e60bd2f44866655adcfff91f91a6 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_444_4chn.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_444_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + spatial_ds: [1, 2] + spatial_us: [1, 2] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + 
+data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_16chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7527679fe0766616df45223fe1e9595101fadc5 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_16chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 
17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_4chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d14b0d75435b24affad90095e4d6e42c6525a8d --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_4chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: 
lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_8chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_8chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..336cbac8319dc3f9c232c02889e598fdf098a777 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_8chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_8chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + 
log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b32064f1589eb3b21cb978f808a95fd6ade31ff9 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_41616_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..bdd33a5ebc635cbf9a04fb94006d14a473d87a3c --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_41616_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47a1090f42b07655626f0d522d756157fdeb1d6a --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_488_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + 
target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05bb8c54df1374c0f67c79d256195dd6451acfbd --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_488_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: 
vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74364fa3209abbda65b0c23311d92ad0975570dd --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml @@ -0,0 +1,120 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 
3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..334749091ff2c561208f169029eae4704d4213e3 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml @@ -0,0 +1,120 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + 
loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2966072ff2a073d404d0cb438674acfc92319033 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml @@ -0,0 +1,122 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [0, 1, 2] + tempo_us: [1, 2, 3] + time_downsample_factor: 8 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + 
loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..deefcc29ecd8212e8bd3edd9d9870d8c64079db7 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml @@ -0,0 +1,116 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [1] + tempo_us: [2] + time_downsample_factor: 2 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + 
learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..965b243859e513244d3d3fd9cc68aa27aee887da --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + 
batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26f68342b48dd5fcae8e5bfcd2fcb5cc5bd1ab1c --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: 
+ input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4badafed742c03fd7850f197cdb8207c59b992e --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: 
META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bcd59650c229ac57560602854015262969d658b --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml @@ -0,0 +1,154 @@ +model: + base_learning_rate: 1.6e-4 + target: vidtwin.models.vidtwin_ae.VidAutoEncoderQformerCompactSymVidVAE + params: + input_key: jpg + monitor: val/rec_loss + ckpt_path: PATH_TO_CHECKPOINT + ignore_keys: [] + expect_ch: 8 + cont_num_blocks: 1 + downsample_motion: True + motion_num_blocks: 1 + d_dim: 8 + + temporal_qformer_config: + target: vidtwin.modules.qformer.MyQformerInterface + params: + num_query_tokens: 16 + query_hidden_size: 64 + encoder_hidden_size: 768 + + encoder_config: + target: vidtwin.modules.st_transformer.STTEncoder + params: + in_channels: 3 + input_size: [16, 224, 224] + patch_size: [1, 16, 16] + hidden_size: 768 + depth: 16 + num_heads: 12 + temporal_casual: true + + decoder_config: + target: vidtwin.modules.st_transformer.STTDecoder + params: + in_channels: 3 + input_size: [16, 224, 224] + patch_size: [1, 16, 16] + hidden_size: 768 + depth: 16 + num_heads: 12 + temporal_casual: true + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + perceptual_weight: 0.05 + disc_start: 20001 + disc_weight: 0.05 + learn_logvar: True + dims: 3 + disc_type: 2d + regularization_weights: + kl_loss: 0.001 + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + params: + sample: True + + + lr_scheduler_config_d: + target: vidtok.models.vidtwin_ae.LambdaWarmUpCosineScheduler + params: + lr_min: 0 + lr_max: 1.5e-05 + lr_start: 1.0e-05 + warmup_steps: 5000 + lr_scheduler_config_g: + target: vidtok.models.vidtwin_ae.LambdaWarmUpCosineScheduler + params: + lr_min: 0 + lr_max: 3.0e-05 + lr_start: 0 + warmup_steps: 5000 + optimizer_config: + target: torch.optim.AdamW + params: + betas: + - 0 + - 0.9 + weight_decay: 0.0001 + lr_scheduler_config: + target: inverse_sqrt + params: + num_warmup_steps: 2000 + frequency: 1 + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: 224 + input_width: 224 + sample_num_frames: 16 + 
sample_fps: 8 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: 224 + input_width: 224 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: True + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 2 + + + + trainer: + # precision: bf16-mixed # 16-mixed + benchmark: True + devices: 4 + num_sanity_val_steps: 10 + val_check_interval: 5000 + accumulate_grad_batches: 1 + max_epochs: 10 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/environment.yaml b/Meissonic/VidTok/vidtok_cache/VidTok/environment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6abf1dcc7cefdab27b272681c0395791b3d432f --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/environment.yaml @@ -0,0 +1,114 @@ +name: vidtok +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2024.11.26=h06a4308_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - libuuid=1.41.5=h5eee18b_0 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.15=h5eee18b_0 + - pip=24.2=py310h06a4308_0 + - python=3.10.15=he870216_1 + - readline=8.2=h5eee18b_0 + - setuptools=75.1.0=py310h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tk=8.6.14=h39e8969_0 + - wheel=0.44.0=py310h06a4308_0 + - xz=5.4.6=h5eee18b_1 + - zlib=1.2.13=h5eee18b_1 + - pip: + - absl-py==2.1.0 + - aiohappyeyeballs==2.4.4 + - aiohttp==3.11.9 + - aiosignal==1.3.1 + - antlr4-python3-runtime==4.9.3 + - appdirs==1.4.4 + - async-timeout==5.0.1 + - attrs==24.2.0 + - av==12.0.0 + - beartype==0.18.2 + - certifi==2024.8.30 + - charset-normalizer==3.4.0 + - click==8.1.7 + - contourpy==1.3.1 + - cycler==0.12.1 + - decord==0.6.0 + - docker-pycreds==0.4.0 + - einops==0.8.0 + - filelock==3.16.1 + - fonttools==4.55.1 + - frozenlist==1.5.0 + - fsspec==2024.10.0 + - gitdb==4.0.11 + - gitpython==3.1.43 + - grpcio==1.68.1 + - idna==3.10 + - imageio==2.34.0 + - jinja2==3.1.4 + - kiwisolver==1.4.7 + - lightning==2.2.4 + - lightning-utilities==0.11.9 + - markdown==3.7 + - markdown-it-py==3.0.0 + - markupsafe==3.0.2 + - matplotlib==3.8.4 + - mdurl==0.1.2 + - mpmath==1.3.0 + - multidict==6.1.0 + - natsort==8.4.0 + - networkx==3.4.2 + - numpy==1.26.4 + - nvidia-cublas-cu12==12.1.3.1 + - nvidia-cuda-cupti-cu12==12.1.105 + - nvidia-cuda-nvrtc-cu12==12.1.105 + - nvidia-cuda-runtime-cu12==12.1.105 + - nvidia-cudnn-cu12==8.9.2.26 + - nvidia-cufft-cu12==11.0.2.54 + - nvidia-curand-cu12==10.3.2.106 + - nvidia-cusolver-cu12==11.4.5.107 + - nvidia-cusparse-cu12==12.1.0.106 + - nvidia-nccl-cu12==2.19.3 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.1.105 + - omegaconf==2.3.0 + - opencv-python==4.6.0.66 + - packaging==24.2 + - pandas==2.1.4 + - pillow==11.0.0 + - propcache==0.2.1 + - protobuf==4.25.5 + - psutil==6.1.0 + - pygments==2.18.0 + - pyparsing==3.2.0 + - python-dateutil==2.9.0.post0 + - 
pytorch-lightning==2.2.4 + - pytz==2024.2 + - pyyaml==6.0.2 + - requests==2.32.3 + - rich==13.5.3 + - safetensors==0.4.2 + - sentry-sdk==2.19.0 + - setproctitle==1.3.4 + - six==1.17.0 + - smmap==5.0.1 + - sympy==1.13.3 + - tensorboard==2.16.2 + - tensorboard-data-server==0.7.2 + - torch==2.2.2 + - torchmetrics==1.6.0 + - torchvision==0.17.2 + - tqdm==4.67.1 + - triton==2.2.0 + - typing-extensions==4.12.2 + - tzdata==2024.2 + - urllib3==2.2.3 + - wandb==0.16.6 + - werkzeug==3.1.3 + - yarl==1.18.3 diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/main.py b/Meissonic/VidTok/vidtok_cache/VidTok/main.py new file mode 100644 index 0000000000000000000000000000000000000000..e03f0de8a1c683fbdde7177083c87a0a3df85f83 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/main.py @@ -0,0 +1,1124 @@ +import argparse +import datetime +import pytz +import glob +import inspect +import os +import re +import sys +import numpy as np +import warnings +warnings.filterwarnings("ignore") +from rich import print +from inspect import Parameter +from typing import Union +from matplotlib import pyplot as plt +from natsort import natsorted +from omegaconf import OmegaConf +from packaging import version +from PIL import Image +from pathlib import Path + +import torch +import torch.distributed as dist +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch import seed_everything +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +from vidtok.modules.util import (exists, instantiate_from_config, isheatmap, + print0, seed_anything) + +MULTINODE_HACKS = True + + +def default_trainer_args(): + argspec = dict(inspect.signature(Trainer.__init__).parameters) + argspec.pop("self") + default_args = { + param: argspec[param].default + for param in argspec + if argspec[param] != Parameter.empty + } + return default_args + + +def get_step_value(folder_name): + match = re.search(r"step=(\d+)", folder_name) + if match: + return int(match.group(1)) + return 0 + + +def get_parser(**parser_kwargs): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser(**parser_kwargs) + parser.add_argument( + "-n", + "--name", + type=str, + const=True, + default="", + nargs="?", + help="postfix for logdir", + ) + parser.add_argument( + "--no_date", + type=str2bool, + nargs="?", + const=True, + default=False, + help="if True, skip date generation for logdir and only use naming via opt.base or opt.name (+ opt.postfix, optionally)", + ) + parser.add_argument( + "-r", + "--resume", + type=str, + const=True, + default="", + nargs="?", + help="resume from logdir or checkpoint in logdir", + ) + parser.add_argument( + "-b", + "--base", + nargs="*", + metavar="base_config.yaml", + help="paths to base configs. Loaded from left-to-right. 
" + "Parameters can be overwritten or added with command-line options of the form `--key value`.", + default=list(), + ) + parser.add_argument( + "-t", + "--train", + type=str2bool, + const=True, + default=True, + nargs="?", + help="train", + ) + parser.add_argument( + "--no-test", + type=str2bool, + const=True, + default=True, + nargs="?", + help="disable test", + ) + parser.add_argument( + "-p", "--project", help="name of new or path to existing project" + ) + parser.add_argument( + "-d", + "--debug", + type=str2bool, + nargs="?", + const=True, + default=False, + help="enable post-mortem debugging", + ) + parser.add_argument( + "-s", + "--seed", + type=int, + default=23, + help="seed for seed_everything", + ) + parser.add_argument( + "--seed_rank", + type=str2bool, + nargs="?", + const=True, + default=False, + help="reset seed every rank on fit start", + ) + parser.add_argument( + "-f", + "--postfix", + type=str, + default="", + help="post-postfix for default name", + ) + parser.add_argument( + "-l", + "--logdir", + type=str, + default="logs", + help="directory for logging dat shit", + ) + parser.add_argument( + "--scale_lr", + type=str2bool, + nargs="?", + const=True, + default=False, + help="scale base-lr by ngpu * batch_size * n_accumulate", + ) + parser.add_argument( + "--legacy_naming", + type=str2bool, + nargs="?", + const=True, + default=False, + help="name run based on config file name if true, else by whole path", + ) + parser.add_argument( + "--enable_tf32", + type=str2bool, + nargs="?", + const=True, + default=True, + help="enables the TensorFloat32 format both for matmuls and cuDNN for pytorch 1.12", + ) + parser.add_argument( + "--startup", + type=str, + default=None, + help="Startuptime from distributed script", + ) + parser.add_argument( + "--wandb", + type=str2bool, + nargs="?", + const=True, + default=False, + help="log to wandb", + ) + parser.add_argument( + "--wandb_entity", + type=str, + default="", + help="Wandb entity name string", + ) + parser.add_argument( + "--wandb_key", + type=str, + default="", + help="Wandb key", + ) + parser.add_argument( + "--wandb_project", + type=str, + default="vidtok", + ) + parser.add_argument( + "--wandb_id", + type=str, + default=None, + help="automatically resume from the same wandb id" + "must be used in combination with --wandb_auto_resume False", + ) + parser.add_argument( + "--wandb_auto_resume", + type=str2bool, + nargs="?", + const=True, + default=True, + help="will find the latest run id in the logdir" + "if checkpoint_auto_resume is False, wandb_auto_resume will be ignored", + ) + parser.add_argument( + "--checkpoint_auto_resume", + type=str2bool, + nargs="?", + const=True, + default=True, + help="will find the latest checkpoint in the logdir" + "if checkpoint_auto_resume is False, wandb_auto_resume will be ignored", + ) + parser.add_argument( + "--no_base_name", + type=str2bool, + nargs="?", + const=True, + default=False, # TODO: later default to True + help="log to wandb", + ) + if version.parse(torch.__version__) >= version.parse("2.0.0"): + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="single checkpoint file to resume from", + ) + default_args = default_trainer_args() + for key in default_args: + # parameters in the pl.Trainer are passed as --key value + parser.add_argument("--" + key, default=default_args[key]) + return parser + + +def get_checkpoint_name(logdir): + ckpt = os.path.join(logdir, "checkpoints", "last**.ckpt") + ckpt = natsorted(glob.glob(ckpt)) + print0('available 
"last" checkpoints:') + print0(ckpt) + if len(ckpt) > 1: + print0("got most recent checkpoint") + ckpt = sorted(ckpt, key=lambda x: os.path.getmtime(x))[-1] + print0(f"Most recent ckpt is {ckpt}") + with open(os.path.join(logdir, "most_recent_ckpt.txt"), "w") as f: + f.write(ckpt + "\n") + try: + version = int(ckpt.split("/")[-1].split("-v")[-1].split(".")[0]) + except Exception as e: + print0("version confusion but not bad") + print0(e) + version = 1 + # version = last_version + 1 + else: + # in this case, we only have one "last.ckpt" + ckpt = ckpt[0] + version = 1 + melk_ckpt_name = f"last-v{version}.ckpt" + print0(f"Current melk ckpt name: {melk_ckpt_name}") + return ckpt, melk_ckpt_name + + +class SetupCallback(Callback): + def __init__( + self, + resume, + now, + logdir, + ckptdir, + cfgdir, + config, + lightning_config, + debug, + save_ckpt_on_exception=False, + ckpt_name=None, + seed=None, + seed_rank=False, + ): + super().__init__() + self.resume = resume + self.now = now + self.logdir = logdir + self.ckptdir = ckptdir + self.cfgdir = cfgdir + self.config = config + self.lightning_config = lightning_config + self.debug = debug + self.save_ckpt_on_exception = save_ckpt_on_exception + self.ckpt_name = ckpt_name + self.seed = seed + self.seed_rank = seed_rank + + def on_exception(self, trainer: pl.Trainer, pl_module, exception): + if self.save_ckpt_on_exception and (not self.debug) and (trainer.global_rank == 0): + print0(f"[bold red]\[main][SetupCallback][/bold red] Saving checkpoint to {self.ckptdir}") + if self.ckpt_name is None: + ckpt_path = os.path.join(self.ckptdir, "last.ckpt") + else: + ckpt_path = os.path.join(self.ckptdir, self.ckpt_name) + trainer.save_checkpoint(ckpt_path) + + def on_fit_start(self, trainer, pl_module): + if self.seed_rank: + # current_seed = torch.initial_seed() + seed_anything(self.seed + trainer.global_rank) + print(f"[bold red]\[main][SetupCallback][/bold red] Rank {trainer.global_rank}: Reset GLOBAL seed to {self.seed + trainer.global_rank}") + elif hasattr(pl_module, "set_seed") and callable(pl_module.set_seed): + pl_module.set_seed(self.seed) + print0(f"[bold red]\[main][SetupCallback][/bold red] Set pl_module seed to {self.seed} with pl_module.set_seed") + if trainer.global_rank == 0: + # Create logdirs and save configs + print0(f"[bold red]\[main][SetupCallback][/bold red] Creating logdir: {self.logdir}, ckptdir: {self.ckptdir}, cfgdir: {self.cfgdir}") + os.makedirs(self.logdir, exist_ok=True) + os.makedirs(self.ckptdir, exist_ok=True) + os.makedirs(self.cfgdir, exist_ok=True) + + if "callbacks" in self.lightning_config: + if ( + "metrics_over_trainsteps_checkpoint" + in self.lightning_config["callbacks"] + ): + os.makedirs( + os.path.join(self.ckptdir, "trainstep_checkpoints"), + exist_ok=True, + ) + print0("[bold red]\[main][SetupCallback][/bold red] Project config") + print0(OmegaConf.to_yaml(self.config)) + if MULTINODE_HACKS and not self.debug: + import time + time.sleep(5) + OmegaConf.save( + self.config, + os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)), + ) + + print0("[bold red]\[main][SetupCallback][/bold red] Lightning config") + print0(OmegaConf.to_yaml(self.lightning_config)) + OmegaConf.save( + OmegaConf.create({"lightning": self.lightning_config}), + os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)), + ) + + else: + # ModelCheckpoint callback created log directory --- remove it + if not MULTINODE_HACKS and not self.resume and os.path.exists(self.logdir): + dst, name = os.path.split(self.logdir) + dst = 
os.path.join(dst, "child_runs", name) + os.makedirs(os.path.split(dst)[0], exist_ok=True) + try: + os.rename(self.logdir, dst) + except FileNotFoundError: + pass + + +class ImageLogger(Callback): + def __init__( + self, + batch_frequency, + max_samples, + clamp=True, + increase_log_steps=True, + rescale=True, + disabled=True, + log_on_batch_idx=False, + log_first_step=False, + log_images_kwargs=None, + log_before_first_step=False, + enable_autocast=True, + ): + super().__init__() + self.enable_autocast = enable_autocast + self.rescale = rescale + self.batch_freq = batch_frequency + self.max_samples = max_samples + self.log_steps = [2**n for n in range(int(np.log2(self.batch_freq)) + 1)] + if not increase_log_steps: + self.log_steps = [self.batch_freq] + self.clamp = clamp + self.disabled = disabled + self.log_on_batch_idx = log_on_batch_idx + self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {} + self.log_first_step = log_first_step + self.log_before_first_step = log_before_first_step + + @rank_zero_only + def log_local( + self, + save_dir, + split, + images, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "images", split) + for k in images: + if isheatmap(images[k]): + fig, ax = plt.subplots() + ax = ax.matshow( + images[k].cpu().numpy(), cmap="hot", interpolation="lanczos" + ) + plt.colorbar(ax) + plt.axis("off") + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + plt.savefig(path) + plt.close() + # TODO: support wandb + else: + grid = torchvision.utils.make_grid(images[k], nrow=4) + if self.rescale: + grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1) + grid = grid.numpy() + grid = (grid * 255).astype(np.uint8) + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + path = os.path.join(root, filename) + os.makedirs(os.path.split(path)[0], exist_ok=True) + img = Image.fromarray(grid) + img.save(path) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "logger_log_image only supports WandbLogger currently" + pl_module.logger.log_image( + key=f"{split}/{k}", + images=[ + img, + ], + step=pl_module.global_step, + ) + + @rank_zero_only + def log_img(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + self.check_frequency(check_idx) + and hasattr(pl_module, "log_images") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_images) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + gpu_autocast_kwargs = { + "enabled": self.enable_autocast, # torch.is_autocast_enabled(), + "dtype": torch.get_autocast_gpu_dtype(), + "cache_enabled": torch.is_autocast_cache_enabled(), + } + with torch.no_grad(), torch.cuda.amp.autocast(**gpu_autocast_kwargs): + images = pl_module.log_images( + batch, split=split, **self.log_images_kwargs + ) + + for k in images: + N = min(images[k].shape[0], self.max_samples) + if not isheatmap(images[k]): + images[k] = images[k][:N] + if isinstance(images[k], torch.Tensor): + images[k] = images[k].detach().float().cpu() + if self.clamp and not isheatmap(images[k]): + images[k] = torch.clamp(images[k], -1.0, 1.0) + + self.log_local( + 
pl_module.logger.save_dir, + split, + images, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + def check_frequency(self, check_idx): + if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and ( + check_idx > 0 or self.log_first_step + ): + try: + self.log_steps.pop(0) + except IndexError as e: + print0("[bold red]\[main][ImageLogger][/bold red]", e) + pass + return True + return False + + @rank_zero_only + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): + if not self.disabled and (pl_module.global_step > 0 or self.log_first_step): + self.log_img(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): + if self.log_before_first_step and pl_module.global_step == 0: + print0(f"[bold red]\[main][ImageLogger][/bold red] {self.__class__.__name__}: logging before training") + self.log_img(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_validation_batch_end( + self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs + ): + if not self.disabled and pl_module.global_step > 0: + self.log_img(pl_module, batch, batch_idx, split="val") + if hasattr(pl_module, "calibrate_grad_norm"): + if ( + pl_module.calibrate_grad_norm and batch_idx % 25 == 0 + ) and batch_idx > 0: + self.log_gradients(trainer, pl_module, batch_idx=batch_idx) + + +@rank_zero_only +def init_wandb(save_dir, opt, config, group_name, name_str): + print0(f"[bold red]\[main][init_wandb][/bold red] Creating WANDB_DIR: {save_dir}") + os.makedirs(save_dir, exist_ok=True) + + # os.environ["WANDB_DIR"] = save_dir + gitcmd = f'git config --global --add safe.directory {os.path.dirname(os.path.abspath(__file__))}' + os.system(gitcmd) + print0(f"[bold red]\[main][init_wandb][/bold red] wandb_id is set to {opt.wandb_id}") + wandb_id = opt.wandb_id if opt.wandb_id is not None else name_str + + if not wandb.api.api_key: + wandb.login(key=opt.wandb_key) + if opt.debug: + wandb.init(project=opt.wandb_project, mode="offline", group=group_name) + else: + wandb.init( + project=opt.wandb_project, + entity=opt.wandb_entity, + config=dict(config), + group=group_name, + name=name_str, + resume='auto', + id=wandb_id, + ) + + +if __name__ == "__main__": + # custom parser to specify config files, train, test and debug mode, + # postfix, resume. + # `--key value` arguments are interpreted as arguments to the trainer. + # `nested.key=value` arguments are interpreted as config parameters. + # configs are merged from left-to-right followed by command line parameters. 
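+    # Example (illustrative only; the config path, log directory, and override value below are placeholders):
+    #   python main.py -b configs/vidtok_kl_causal_488_4chn.yaml -l logs --devices 1 data.params.batch_size=2
+    # where `data.params.batch_size=2` is merged into the loaded config as an OmegaConf dotlist override.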
+ + # model: + # base_learning_rate: float + # target: path to lightning module + # params: + # key: value + # data: + # target: main.DataModuleFromConfig + # params: + # batch_size: int + # wrap: bool + # train: + # target: path to train dataset + # params: + # key: value + # validation: + # target: path to validation dataset + # params: + # key: value + # test: + # target: path to test dataset + # params: + # key: value + # lightning: (optional, has sane defaults and can be specified on cmdline) + # trainer: + # additional arguments to trainer + # logger: + # logger to instantiate + # modelcheckpoint: + # modelcheckpoint to instantiate + # callbacks: + # callback1: + # target: importpath + # params: + # key: value + + now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + + # add cwd for convenience and to make classes in this file available when + # running as `python main.py` + # (in particular `main.DataModuleFromConfig`) + sys.path.append(os.getcwd()) + + parser = get_parser() + + opt, unknown = parser.parse_known_args() + + if opt.name and opt.resume: + raise ValueError( + "-n/--name and -r/--resume cannot be specified both." + "If you want to resume training in a new log folder, " + "use -n/--name in combination with --resume_from_checkpoint" + ) + melk_ckpt_name = None + name = None + if opt.resume: + if not os.path.exists(opt.resume): + raise ValueError("Cannot find {}".format(opt.resume)) + if os.path.isfile(opt.resume): + paths = opt.resume.split("/") + # idx = len(paths)-paths[::-1].index("logs")+1 + # logdir = "/".join(paths[:idx]) + logdir = "/".join(paths[:-2]) + ckpt = opt.resume + _, melk_ckpt_name = get_checkpoint_name(logdir) + else: + assert os.path.isdir(opt.resume), opt.resume + logdir = opt.resume.rstrip("/") + ckpt, melk_ckpt_name = get_checkpoint_name(logdir) + + print0("-" * 80) + print0(f'[bold red][main][/bold red] Resuming from checkpoint "{ckpt}"') + + opt.resume_from_checkpoint = ckpt + base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml"))) + opt.base = base_configs + opt.base + _tmp = logdir.split("/") + nowname = _tmp[-1] + else: + if opt.name: + name = "_" + opt.name + elif opt.base: + if opt.no_base_name: + name = "" + else: + if opt.legacy_naming: + cfg_fname = os.path.split(opt.base[0])[-1] + cfg_name = os.path.splitext(cfg_fname)[0] + else: + assert "configs" in os.path.split(opt.base[0])[0], os.path.split( + opt.base[0] + )[0] + cfg_path = os.path.split(opt.base[0])[0].split(os.sep)[ + os.path.split(opt.base[0])[0].split(os.sep).index("configs") + + 1 : + ] # cut away the first one (we assert all configs are in "configs") + cfg_name = os.path.splitext(os.path.split(opt.base[0])[-1])[0] + cfg_name = "-".join(cfg_path) + f"-{cfg_name}" + name = "_" + cfg_name + else: + name = "" + # automatic resume last checkpoint if available + if os.path.exists(opt.logdir): + auto_resumed = False + for sub_dir in sorted(os.listdir(opt.logdir)): + if sub_dir.endswith(name + opt.postfix): + ## checkpoint resume + if opt.checkpoint_auto_resume and not opt.debug: + checkpoint_dir = os.path.join(opt.logdir, sub_dir, "checkpoints") + # Use the max step checkpoint file + ckpt_files1 = glob.glob(os.path.join(checkpoint_dir, "*/*.ckpt")) + ckpt_files2 = glob.glob(os.path.join(checkpoint_dir, "*.ckpt")) + ckpt_files = ckpt_files1 + ckpt_files2 + ckpt_files.sort(key=get_step_value, reverse=True) + if ckpt_files: + ckpt = ckpt_files[0] + else: + # If no checkpoint files found, use a random initialized model + ckpt = None + if ckpt is not None and 
os.path.isfile(ckpt): + opt.resume_from_checkpoint = ckpt + auto_resumed = True + # print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Find previous log dir and checkpoint: {ckpt}") + ## wandb resume + if opt.wandb_auto_resume: + wandb_dir = Path(os.path.join(opt.logdir, sub_dir)) / "wandb" + if wandb_dir.exists() and any((wandb_dir / "latest-run").iterdir()): + # Parse unique `run_id` from the `.wandb.` file... + wandb_fns = [f.name for f in (wandb_dir / "latest-run").iterdir() if f.name.endswith(".wandb")] + assert len(wandb_fns) == 1, f"There should only be 1 `.wandb.` file... found {len(wandb_fns)}!" + # Regex Match on `run-{id}.wandb` + opt.wandb_id = re.search("run-(.+?).wandb", wandb_fns[0]).group(1) + # print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Find previous wandb run id: {opt.wandb_id}") + if auto_resumed: + print0(f"[bold red]\[main][/bold red] Auto-resuming from checkpoint: {opt.resume_from_checkpoint} and wandb id: {opt.wandb_id}") + ckpt_basename = os.path.basename(opt.resume_from_checkpoint) + seed_str = ''.join(re.findall(r'\d+', ckpt_basename)) + if len(seed_str) > 0: + opt.seed = int(seed_str) + print0(f"[bold red]\[main][/bold red] Auto-reseting seed to {opt.seed} from checkpoint name") + + if not opt.no_date: + nowname = now + name + opt.postfix + else: + nowname = name + opt.postfix + if nowname.startswith("_"): + nowname = nowname[1:] + logdir = os.path.join(opt.logdir, nowname) + print0(f"[bold red]\[main][/bold red] LOGDIR: {logdir}") + + ckptdir = os.path.join(logdir, "checkpoints") + cfgdir = os.path.join(logdir, "configs") + if not opt.seed_rank: + seed_everything(opt.seed, workers=True) # torch.initial_seed() + + # move before model init, in case a torch.compile(...) is called somewhere + if opt.enable_tf32: + # pt_version = version.parse(torch.__version__) + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + print0(f"[bold red]\[main][/bold red] Enabling TF32 for PyTorch {torch.__version__}") + else: + print0(f"[bold red]\[main][/bold red] Using default TF32 settings for PyTorch {torch.__version__}:") + print0(f"[bold red]\[main][/bold red] torch.backends.cuda.matmul.allow_tf32={torch.backends.cuda.matmul.allow_tf32}") + print0(f"[bold red]\[main][/bold red] torch.backends.cudnn.allow_tf32={torch.backends.cudnn.allow_tf32}") + + try: + # init and save configs + configs = [OmegaConf.load(cfg) for cfg in opt.base] + # deal with the unknown args, e.g., --model.base_learning_rate=1.0e-4 + for i, u in enumerate(unknown): + if u.startswith("--"): + unknown[i] = u[2:] + # merge all configs and cli args + cli = OmegaConf.from_dotlist(unknown) + config = OmegaConf.merge(*configs, cli) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Merged input config: {config}") + lightning_config = config.pop("lightning", OmegaConf.create()) + # merge trainer cli with config + trainer_config = lightning_config.get("trainer", OmegaConf.create()) + + # debug: default to one node + if opt.debug: + trainer_config["num_nodes"] = 1 + + # default profiler + trainer_config["profiler"] = None if not opt.debug else "simple" + + # default to gpu + trainer_config["accelerator"] = "gpu" + # + standard_args = default_trainer_args() + for k in standard_args: + if getattr(opt, k) != standard_args[k]: + trainer_config[k] = getattr(opt, k) + + if not "devices" in trainer_config and trainer_config["accelerator"] != "gpu": + del trainer_config["accelerator"] + cpu = True + else: + gpuinfo = trainer_config["devices"] + print0(f"[bold 
red]\[main][/bold red] Running on {gpuinfo} GPUs") + cpu = False + trainer_opt = argparse.Namespace(**trainer_config) + lightning_config.trainer = trainer_config + + # model + model = instantiate_from_config(config.model) + + # trainer and callbacks + trainer_kwargs = dict() + + # default logger configs + default_logger_cfgs = { + "wandb": { + "target": "lightning.pytorch.loggers.WandbLogger", + "params": { + "name": nowname, + "save_dir": logdir, + "offline": opt.debug, + "id": nowname, + "project": opt.wandb_project, + "log_model": False, + "entity": opt.wandb_entity, + }, + }, + "csv": { + "target": "lightning.pytorch.loggers.CSVLogger", + "params": { + "name": "testtube", # hack for sbord fanatics + "save_dir": logdir, + }, + }, + "tensorboard": { + "target": "lightning.pytorch.loggers.TensorBoardLogger", + "params": { + "save_dir": logdir, + "name": 'tensorboard', + "version": nowname, + } + }, + } + default_logger_cfg = default_logger_cfgs["wandb" if opt.wandb else "tensorboard"] + if opt.wandb: + # change once leaving "swiffer" config directory + try: + group_name = nowname.split(now)[-1].split("-")[1] + except: + group_name = nowname + default_logger_cfg["params"]["group"] = group_name + + wandb_save_dir = os.path.join(os.getcwd(), logdir) + os.environ["WANDB_DIR"] = wandb_save_dir + + init_wandb( + wandb_save_dir, + opt=opt, + group_name=group_name, + config=config, + name_str=nowname, + ) + if "logger" in lightning_config: + logger_cfg = lightning_config.logger + else: + logger_cfg = OmegaConf.create() + logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg) + trainer_kwargs["logger"] = instantiate_from_config(logger_cfg) + + ckpt_resume_path = opt.resume_from_checkpoint + + # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to + # specify which metric is used to determine best models + default_modelckpt_cfg = { + "target": "lightning.pytorch.callbacks.ModelCheckpoint", + "params": { + "dirpath": ckptdir, + "filename": "{epoch:04}-{step:08}", # "epoch={epoch:06}-step={step:07}" + "verbose": True, + "save_last": True, + "auto_insert_metric_name": True, + }, + } + if hasattr(model, "monitor"): + print0(f"[bold red]\[main][/bold red] Monitoring {model.monitor} as checkpoint metric.") + default_modelckpt_cfg["params"]["monitor"] = model.monitor + default_modelckpt_cfg["params"]["save_top_k"] = 3 + + if "modelcheckpoint" in lightning_config: + modelckpt_cfg = lightning_config.modelcheckpoint + else: + modelckpt_cfg = OmegaConf.create() + modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Merged modelckpt-cfg: {modelckpt_cfg}") + + # https://pytorch-lightning.readthedocs.io/en/stable/extensions/strategy.html + # default to ddp if not further specified + default_strategy_config = {"target": "lightning.pytorch.strategies.DDPStrategy"} + + if "strategy" in lightning_config: + strategy_cfg = lightning_config.strategy + else: + strategy_cfg = OmegaConf.create() + default_strategy_config["params"] = { + "find_unused_parameters": False, + # "static_graph": True, + # "ddp_comm_hook": default.fp16_compress_hook # experiment with this, also for DDPSharded + } + strategy_cfg = OmegaConf.merge(default_strategy_config, strategy_cfg) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] strategy config: {strategy_cfg}") + trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg) + if hasattr(trainer_kwargs["strategy"], "_timeout"): + trainer_kwargs["strategy"]._timeout = 
datetime.timedelta(seconds=5400) # 3600s = 1h + + # add callback which sets up log directory + default_callbacks_cfg = { + "setup_callback": { + "target": "main.SetupCallback", + "params": { + "resume": opt.resume, + "now": now, + "logdir": logdir, + "ckptdir": ckptdir, + "cfgdir": cfgdir, + "config": config, + "lightning_config": lightning_config, + "debug": opt.debug, + "ckpt_name": melk_ckpt_name, + "seed": opt.seed, + "seed_rank": opt.seed_rank + }, + }, + "image_logger": { + "target": "main.ImageLogger", + "params": {"batch_frequency": 1000, "max_samples": 4, "clamp": True}, + }, + "learning_rate_logger": { + "target": "lightning.pytorch.callbacks.LearningRateMonitor", + "params": { + "logging_interval": "step", + # "log_momentum": True + }, + }, + } + if version.parse(pl.__version__) >= version.parse("1.4.0"): + default_callbacks_cfg.update({"checkpoint_callback": modelckpt_cfg}) + + if "callbacks" in lightning_config: + callbacks_cfg = lightning_config.callbacks + else: + callbacks_cfg = OmegaConf.create() + + if "metrics_over_trainsteps_checkpoint" in callbacks_cfg: + print0( + "[bold red]\[main][/bold red] Caution: Saving checkpoints every n train steps without deleting. This might require some free space." + ) + default_metrics_over_trainsteps_ckpt_dict = { + "metrics_over_trainsteps_checkpoint": { + "target": "lightning.pytorch.callbacks.ModelCheckpoint", + "params": { + "dirpath": os.path.join(ckptdir, "trainstep_checkpoints"), + "filename": "{epoch:04}-{step:08}", # "{epoch:06}-{step:09}" + "verbose": True, + "save_top_k": -1, + "every_n_train_steps": 10000, + "save_weights_only": True, + }, + } + } + default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict) + + callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg) + if "ignore_keys_callback" in callbacks_cfg and ckpt_resume_path is not None: + callbacks_cfg.ignore_keys_callback.params["ckpt_path"] = ckpt_resume_path + elif "ignore_keys_callback" in callbacks_cfg: + del callbacks_cfg["ignore_keys_callback"] + + trainer_kwargs["callbacks"] = [ + instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg + ] + if not "plugins" in trainer_kwargs: + trainer_kwargs["plugins"] = list() + + # cmd line trainer args (which are in trainer_opt) have always priority over config-trainer-args (which are in trainer_kwargs) + trainer_opt = vars(trainer_opt) + trainer_kwargs = { + key: val for key, val in trainer_kwargs.items() if key not in trainer_opt + } + trainer = Trainer(**trainer_opt, **trainer_kwargs) + + trainer.logdir = logdir + + # data + if ((not opt.train) or opt.debug) and hasattr(config.data.params, "validation"): + config.data.params.train = config.data.params.validation + print0("[bold red]\[main][/bold red] Using validation data as training data for fast loading.") + data = instantiate_from_config(config.data) + # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + # calling these ourselves should not be necessary but it is. 
+ # lightning still takes care of proper multiprocessing though + data.prepare_data() + # data.setup() + try: + for k in data.datasets: + print0( + f"[bold red]\[main][/bold red] {k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}" + ) + except: + print0("[bold red]\[main][/bold red] datasets not yet initialized.") + + # configure learning rate + if "batch_size" in config.data.params: + bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate + else: + bs, base_lr = ( + config.data.params.train.loader.batch_size, + config.model.base_learning_rate, + ) + if not cpu: + # add for different device input type + if isinstance(lightning_config.trainer.devices, int): + ngpu = lightning_config.trainer.devices + elif isinstance(lightning_config.trainer.devices, list): + ngpu = len(lightning_config.trainer.devices) + elif isinstance(lightning_config.trainer.devices, str): + ngpu = len(lightning_config.trainer.devices.strip(",").split(",")) + else: + ngpu = 1 + if "accumulate_grad_batches" in lightning_config.trainer: + accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches + else: + accumulate_grad_batches = 1 + print0(f"[bold red]\[main][/bold red] accumulate_grad_batches = {accumulate_grad_batches}") + lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches + + if opt.scale_lr: + model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr + print0( + "[bold red]\[main][/bold red] Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)".format( + model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr + ) + ) + else: + model.learning_rate = base_lr + print0("[bold red]\[main][/bold red] NOT using learning rate scaling") + print0(f"[bold red]\[main][/bold red] Setting learning rate to {model.learning_rate:.2e}") + + # allow checkpointing via USR1 + def melk(*args, **kwargs): + # run all checkpoint hooks + if trainer.global_rank == 0: + melkdir = os.path.join(logdir, "melk") + os.makedirs(melkdir, exist_ok=True) + print0(f"[bold red]\[main][/bold red] Saving checkpoint to {melkdir}") + if melk_ckpt_name is None: + ckpt_path = os.path.join(melkdir, "last.ckpt") + else: + ckpt_path = os.path.join(melkdir, melk_ckpt_name) + trainer.save_checkpoint(ckpt_path) + + def divein(*args, **kwargs): + if trainer.global_rank == 0: + import pudb + pudb.set_trace() + + import signal + signal.signal(signal.SIGUSR1, melk) + signal.signal(signal.SIGUSR2, divein) + + # run + if opt.train: + try: + trainer.fit(model, data, ckpt_path=ckpt_resume_path) + print0(f"[bold red]\[main][/bold red] Finish training with logdir: {logdir}") + except Exception as e: + print(f"") + print(f"[bold red]\[main][/bold red] Exception: {e}") + print(f"[bold red]\[main][/bold red] Beijing Time {datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai'))}") + if not opt.debug: + melk() + raise + else: + trainer.validate(model, data, ckpt_path=ckpt_resume_path) + exit() + if not opt.no_test and not trainer.interrupted: + trainer.test(model, data) + except RuntimeError as err: + if MULTINODE_HACKS: + import datetime + import os + import socket + import requests + + device = os.environ.get("CUDA_VISIBLE_DEVICES", "?") + hostname = socket.gethostname() + ts = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + resp = requests.get("http://169.254.169.254/latest/meta-data/instance-id") + print( + f"[bold red]\[main][/bold red] ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): 
{type(err).__name__}: {err}", + flush=True, + ) + raise err + except Exception: + if opt.debug and trainer.global_rank == 0: + try: + import pudb as debugger + except ImportError: + import pdb as debugger + # debugger.post_mortem() + raise + finally: + # move newly created debug project to debug_runs + if opt.debug and not opt.resume and trainer.global_rank == 0: + dst, name = os.path.split(logdir) + dst = os.path.join(dst, "debug_runs", name) + os.makedirs(os.path.split(dst)[0], exist_ok=True) + os.rename(logdir, dst) + + if opt.wandb: + wandb.finish() + + # clean up + # dist.barrier() + # torch.cuda.empty_cache() + dist.destroy_process_group() + + if trainer.global_rank == 0 and opt.debug: + print0(f"[bold red]\[main][/bold red] Current logdir: {logdir}") + # print0(f"[bold red]\[main][/bold red] Profiler summary:") + # print(trainer.profiler.summary()) + print0(f"[bold red]\[main][/bold red] Memory summary:") + num_params = sum([p.numel() for p in model.parameters()]) + print0(f"[bold red]\[main][/bold red] Expected bf16 memory usage from params: {num_params * 2 / 1e9:.2f} GB") + print0(f"[bold red]\[main][/bold red] Current memory usage with model on device {torch.cuda.max_memory_allocated() / 1e9:.2f} GB") + # trainer.print(torch.cuda.memory_summary()) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc b/Meissonic/VidTok/vidtok_cache/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c26fbccfd6bb7ce7052966d21ee38c2b0bc24ba7 Binary files /dev/null and b/Meissonic/VidTok/vidtok_cache/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/scripts/inference_evaluate.py b/Meissonic/VidTok/vidtok_cache/VidTok/scripts/inference_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..d4152046727305af7b32e0a3e5728072b7085ca4 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/scripts/inference_evaluate.py @@ -0,0 +1,198 @@ +import argparse +import os +import sys +sys.path.append(os.getcwd()) + +import warnings +warnings.filterwarnings("ignore") + +import time +from contextlib import nullcontext +from omegaconf import OmegaConf +from torch import autocast +from tqdm import tqdm + +import numpy as np +import torch +from einops import rearrange +from lightning.pytorch import seed_everything + +from vidtok.data.vidtok import VidTokValDataset +from vidtok.modules.lpips import LPIPS +from vidtok.modules.util import (compute_psnr, compute_ssim, + instantiate_from_config, print0) + + +def load_model_from_config(config, ckpt, ignore_keys=[], verbose=False): + config = OmegaConf.load(config) + config.model.params.ckpt_path = ckpt + config.model.params.ignore_keys = ignore_keys + config.model.params.verbose = verbose + model = instantiate_from_config(config.model) + return model + + +class MultiVideoDataset(VidTokValDataset): + def __init__( + self, + data_dir, + meta_path=None, + input_height=256, + input_width=256, + sample_fps=30, + chunk_size=16, + is_causal=True, + read_long_video=False + ): + super().__init__( + data_dir=data_dir, + meta_path=meta_path, + video_params={ + "input_height": input_height, + "input_width": input_width, + "sample_num_frames": chunk_size + 1 if is_causal else chunk_size, + "sample_fps": sample_fps, + }, + pre_load_frames=True, + last_frames_handle="repeat", + read_long_video=read_long_video, + chunk_size=chunk_size, + is_causal=is_causal, + ) + + 
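+    # Note: unlike the parent VidTokValDataset, __getitem__ below returns only the
+    # (C, T, H, W) frame tensor and drops the "path" entry from the parent's item dict.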
def __getitem__(self, idx): + frames = super().__getitem__(idx)["jpg"] + return frames + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--data_dir", + type=str, + default="./", + help="root folder", + ) + parser.add_argument( + "--meta_path", + type=str, + default=None, + help="path to the .csv meta file", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=16, + help="the size of a chunk - we split a long video into several chunks", + ) + parser.add_argument( + "--read_long_video", + action='store_true' + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[scripts.inference_evaluate][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + assert args.chunk_size % model.encoder.time_downsample_factor == 0 + + + if args.read_long_video: + assert hasattr(model, 'use_tiling'), "Tiling inference is needed to conduct long video reconstruction." 
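+        # Long videos are processed chunk-by-chunk through the model's tiling path;
+        # t_chunk_enc / t_chunk_dec set below bound how many frames are encoded/decoded at once.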
+ print(f"Using tiling inference to save memory usage...") + model.enable_tiling() + model.t_chunk_enc = args.chunk_size + model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor + + if args.input_width > 256: + model.enable_tiling() + + dataset = MultiVideoDataset( + data_dir=args.data_dir, + meta_path=args.meta_path, + input_height=args.input_height, + input_width=args.input_width, + sample_fps=args.sample_fps, + chunk_size=args.chunk_size, + is_causal=model.is_causal, + read_long_video=args.read_long_video + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + perceptual_loss = LPIPS().eval() + perceptual_loss = perceptual_loss.to(device) + + psnrs, ssims, lpipss = [], [], [] + + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, output, reg_log = model(input) + output = output.clamp(-1, 1) + input, output = map(lambda x: (x + 1) / 2, (input, output)) + + if input.dim() == 5: + input = rearrange(input, "b c t h w -> (b t) c h w") + assert output.dim() == 5 + output = rearrange(output, "b c t h w -> (b t) c h w") + + for inp, out in zip(torch.split(input, 16), torch.split(output, 16)): + psnrs += [compute_psnr(inp, out).item()] * inp.shape[0] + ssims += [compute_ssim(inp, out).item()] * inp.shape[0] + lpipss += [perceptual_loss(inp * 2 - 1, out * 2 - 1).mean().item()] * inp.shape[0] + + toc = time.time() + print0( + f"[bold red]\[scripts.inference_evaluate][/bold red] PSNR: {np.mean(psnrs):.4f}, SSIM: {np.mean(ssims):.4f}, LPIPS: {np.mean(lpipss):.4f}" + ) + print0(f"[bold red]\[scripts.inference_evaluate][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/scripts/inference_reconstruct.py b/Meissonic/VidTok/vidtok_cache/VidTok/scripts/inference_reconstruct.py new file mode 100644 index 0000000000000000000000000000000000000000..3a26b9475339f6675e01fd052637c8465ca37caf --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/scripts/inference_reconstruct.py @@ -0,0 +1,246 @@ +import os +import sys +sys.path.append(os.getcwd()) + +import argparse +import warnings +warnings.filterwarnings("ignore") + +import time +from contextlib import nullcontext +from omegaconf import OmegaConf +from pathlib import Path +from tqdm import tqdm + +import numpy as np +import torch +import decord +from einops import rearrange +from lightning.pytorch import seed_everything +from torch import autocast +from torchvision import transforms +from torchvision.io import write_video + +from vidtok.modules.util import print0 +from scripts.inference_evaluate import load_model_from_config + + +class SingleVideoDataset(torch.utils.data.Dataset): + def __init__( + self, + video_path, + input_height=128, + input_width=128, + sample_fps=8, + chunk_size=16, + is_causal=True, + read_long_video=False + ): + decord.bridge.set_bridge("torch") + self.video_path = video_path + self.transform = transforms.Compose( + [ + transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + ] + ) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + if read_long_video: + video_length 
= len(frame_ids) + if is_causal and video_length > chunk_size: + self.frame_ids_batch.append(frame_ids[:chunk_size * ((video_length - 1) // chunk_size) + 1]) + elif not is_causal and video_length >= chunk_size: + self.frame_ids_batch.append(frame_ids[:chunk_size * (video_length // chunk_size)]) + else: + num_frames_per_batch = chunk_size + 1 if is_causal else chunk_size + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x : x + num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x : x + num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255.0 + frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=16, + help="the size of a chunk - we split a long video into several chunks", + ) + parser.add_argument( + "--read_long_video", + action='store_true' + ) + parser.add_argument( + "--pad_gen_frames", + action="store_true", + help="Used only in causal mode. 
If True, pad frames generated in the last batch, else replicate the first frame instead", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[scripts.inference_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + config = OmegaConf.load(args.config) + + os.makedirs(args.output_video_dir, exist_ok=True) + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + assert args.chunk_size % model.encoder.time_downsample_factor == 0 + + if args.read_long_video: + assert hasattr(model, 'use_tiling'), "Tiling inference is needed to conduct long video reconstruction." + print(f"Using tiling inference to save memory usage...") + model.use_tiling = True + model.t_chunk_enc = args.chunk_size + model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor + model.use_overlap = True + + dataset = SingleVideoDataset( + video_path=args.input_video_path, + input_height=args.input_height, + input_width=args.input_width, + sample_fps=args.sample_fps, + chunk_size=args.chunk_size, + is_causal=model.is_causal, + read_long_video=args.read_long_video + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + inputs = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + + if model.is_causal and not args.read_long_video and args.pad_gen_frames: + if i == 0: + _, xrec, _ = model(input) + else: + _, xrec, _ = model(torch.cat([last_gen_frames, input], dim=2)) + xrec = xrec[:, :, -input.shape[2]:].clamp(-1, 1) + last_gen_frames = xrec[:, :, (1 - model.encoder.time_downsample_factor):, :, :] + else: + _, xrec, _ = model(input) + + input = rearrange(input, "b c t h w -> (b t) c h w") + inputs.append(input) + xrec = rearrange(xrec.clamp(-1, 1), "b c t h w -> (b t) c h w") + outputs.append(xrec) + + toc = time.time() + + # save the outputs as videos + inputs = tensor_to_uint8(torch.cat(inputs, dim=0)) + inputs = rearrange(inputs, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs.shape[0], outputs.shape[0]) + final = np.concatenate([inputs[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"{Path(args.input_video_path).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + + print0(f"[bold red]Results saved in: {output_video_path}[/bold red]") + print0(f"[bold red]\[scripts.inference_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f33d1eb5f9eb0a67085dce41389212b2f3e07c5 Binary files /dev/null and b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/datamodule.py 
b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/datamodule.py new file mode 100644 index 0000000000000000000000000000000000000000..c405b84703735b9a076b9ffb4e74de673470c15b --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/datamodule.py @@ -0,0 +1,150 @@ +import numpy as np +from functools import partial + +import torch +import lightning.pytorch as pl +from torch.utils.data import DataLoader, Dataset, IterableDataset + +from vidtok.modules.util import instantiate_from_config + + +class WrappedDataset(Dataset): + """Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset""" + + def __init__(self, dataset): + self.data = dataset + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + +def worker_init_fn(_): + worker_info = torch.utils.data.get_worker_info() + + dataset = worker_info.dataset + worker_id = worker_info.id + + if isinstance(dataset, IterableDataset): + split_size = dataset.num_records // worker_info.num_workers + # reset num_records to the true number to retain reliable length information + dataset.sample_ids = dataset.valid_ids[ + worker_id * split_size : (worker_id + 1) * split_size + ] + current_id = np.random.choice(len(np.random.get_state()[1]), 1) + return np.random.seed(np.random.get_state()[1][current_id] + worker_id) + else: + return np.random.seed(np.random.get_state()[1][0] + worker_id) + + +class DataModuleFromConfig(pl.LightningDataModule): + def __init__( + self, + batch_size, + train=None, + validation=None, + test=None, + predict=None, + wrap=False, + num_workers=None, + pin_train_memory=True, + is_iterable_dataset=False, + shuffle_test_loader=False, + use_worker_init_fn=False, + shuffle_val_dataloader=False, + ): + super().__init__() + self.batch_size = batch_size + self.dataset_configs = dict() + self.num_workers = num_workers if num_workers is not None else batch_size * 2 + self.pin_train_memory = pin_train_memory + self.is_iterable_dataset = is_iterable_dataset + self.use_worker_init_fn = use_worker_init_fn + if train is not None: + self.dataset_configs["train"] = train + self.train_dataloader = self._train_dataloader + if validation is not None: + self.dataset_configs["validation"] = validation + self.val_dataloader = partial( + self._val_dataloader, shuffle=shuffle_val_dataloader + ) + if test is not None: + self.dataset_configs["test"] = test + self.test_dataloader = partial( + self._test_dataloader, shuffle=shuffle_test_loader + ) + if predict is not None: + self.dataset_configs["predict"] = predict + self.predict_dataloader = self._predict_dataloader + self.wrap = wrap + + def prepare_data(self): + for data_cfg in self.dataset_configs.values(): + instantiate_from_config(data_cfg) + + def setup(self, stage=None): + self.datasets = dict( + (k, instantiate_from_config(self.dataset_configs[k])) + for k in self.dataset_configs + ) + if self.wrap: + for k in self.datasets: + self.datasets[k] = WrappedDataset(self.datasets[k]) + + def _train_dataloader(self): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + self.datasets["train"], + batch_size=self.batch_size, + num_workers=self.num_workers, + pin_memory=self.pin_train_memory, + shuffle=False if self.is_iterable_dataset else True, + worker_init_fn=init_fn, + ) + + def _val_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + 
self.datasets["validation"], + batch_size=self.batch_size, + num_workers=self.num_workers, + worker_init_fn=init_fn, + shuffle=shuffle, + ) + + def _test_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + + # do not shuffle dataloader for iterable dataset + shuffle = shuffle and (not self.is_iterable_dataset) + + return DataLoader( + self.datasets["test"], + batch_size=self.batch_size, + num_workers=self.num_workers, + worker_init_fn=init_fn, + shuffle=shuffle, + ) + + def _predict_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + self.datasets["predict"], + batch_size=self.batch_size, + num_workers=self.num_workers, + worker_init_fn=init_fn, + ) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/video_read.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/video_read.py new file mode 100644 index 0000000000000000000000000000000000000000..357cd48305141ee70924fd6a79adbe5cb7f6ca5b --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/video_read.py @@ -0,0 +1,88 @@ +import os +import random +import decord +import numpy as np +import torch + +from vidtok.modules.util import print0 + +decord.bridge.set_bridge("torch") + + +def sample_frames_with_fps( + total_frames, + video_fps, + sample_num_frames, + sample_fps, + start_index=None +): + """sample frames proportional to the length of the frames in one second + e.g., 1s video has 30 frames, when 'fps'=3, we sample frames with spacing of 30/3=10 + return the frame indices + + Parameters + ---------- + total_frames : length of the video + video_fps : original fps of the video + sample_num_frames : number of frames to sample + sample_fps : the fps to sample frames + start_index : the starting frame index. If it is not None, it will be used as the starting frame index + + Returns + ------- + frame indices + """ + sample_num_frames = min(sample_num_frames, total_frames) + interval = round(video_fps / sample_fps) + frames_range = (sample_num_frames - 1) * interval + 1 + + if start_index is not None: + start = start_index + elif total_frames - frames_range - 1 < 0: + start = 0 + else: + start = random.randint(0, total_frames - frames_range - 1) + + frame_idxs = np.linspace( + start=start, stop=min(total_frames - 1, start + frames_range), num=sample_num_frames + ).astype(int) + + return frame_idxs + + +def read_frames_with_decord( + video_path, + sample_num_frames, + sample_fps, + start_index=None +) -> tuple[torch.Tensor, list[int]]: + """read frames from video path using decord + + Parameters + ---------- + video_path : path to video + sample_num_frames : number of frames to sample + sample_fps : the fps to sample frames + start_index : the starting frame index. If it is not None, it will be used as the starting frame index + + Returns + ------- + frames (tensor 0~1), frame indices + """ + video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(video_reader) + video_fps = video_reader.get_avg_fps() # note that the fps here is float. 
+ frame_idxs = sample_frames_with_fps( + total_frames=total_frames, + video_fps=video_fps, + sample_num_frames=sample_num_frames, + sample_fps=sample_fps, + start_index=start_index + ) + frames = video_reader.get_batch(frame_idxs) + frames = frames.float() / 255 + frames = frames.permute(0, 3, 1, 2) + if (frames.shape[0] != sample_num_frames) or (len(frame_idxs) != sample_num_frames): + print0(f"[bold yellow]\[vidtok.data.video_read][read_frames_with_decord][/bold yellow] Warning: need {sample_num_frames} frames, " + f"but got {frames.shape[0]} frames, {len(frame_idxs)} frame indices, video_path={video_path}.") + return frames, frame_idxs diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/vidtok.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/vidtok.py new file mode 100644 index 0000000000000000000000000000000000000000..4b1898d4718de1450da96f25e9134dd4e1084cc1 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/data/vidtok.py @@ -0,0 +1,333 @@ +import os +import glob +from typing import Union + +import decord +import numpy as np +import pandas as pd +import torch +from PIL import Image +from torch.utils.data import Dataset +from torchvision.transforms import v2 +from tqdm import trange + +from vidtok.modules.util import print0 +from .video_read import read_frames_with_decord + + +class VidTokDataset(Dataset): + def __init__( + self, + data_dir: str, + meta_path: str, + video_params: dict, + data_frac: float = 1.0, + is_strict_loading: bool = False, + skip_missing_files: bool = True, + start_index: Union[None, int] = None + ): + super().__init__() + + self.data_dir = data_dir + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Use data dir: {self.data_dir}") + + self.meta_path = meta_path + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Use meta path: {self.meta_path}") + + self.video_params = video_params + + self.data_frac = data_frac + self.is_strict_loading = is_strict_loading + self.skip_missing_files = skip_missing_files + self.start_index = start_index + self.transforms = self._get_transforms( + video_params["input_height"], + video_params["input_width"], + ) + + self.missing_files = [] + self._load_metadata() + + def _get_transforms(self, input_height, input_width, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5]): + normalize = v2.Normalize(mean=norm_mean, std=norm_std) + return v2.Compose( + [ + v2.Resize(input_height, antialias=True), + v2.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + def _load_metadata(self): + metadata = pd.read_csv( + self.meta_path, + on_bad_lines="skip", + encoding="ISO-8859-1", + engine="python", + sep=",",) + + if self.data_frac < 1: + metadata = metadata.sample(frac=self.data_frac) + self.metadata = metadata + self.metadata.dropna(inplace=True) + + def _get_video_path(self, sample): + """reduce the access to the disk + """ + rel_video_fp = str(sample["videos"]) + abs_video_fp = os.path.join(self.data_dir, rel_video_fp) + return abs_video_fp, rel_video_fp + + def __len__(self): + return len(self.metadata) + + def __getitem__(self, item): + item = item % len(self.metadata) + sample = self.metadata.iloc[item] + video_fp, _ = self._get_video_path(sample) + + try: + if os.path.isfile(video_fp): + imgs, idxs = read_frames_with_decord( + video_path=video_fp, + sample_num_frames=self.video_params["sample_num_frames"], + sample_fps=self.video_params["sample_fps"], + start_index=self.start_index + ) + else: + # if the video file is missing + if video_fp not in 
self.missing_files: + self.missing_files.append(video_fp) + # resample another video or not + if self.skip_missing_files: + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Warning: missing video file {video_fp}. Resampling another video.") + return self.__getitem__(np.random.choice(self.__len__())) + else: + raise ValueError(f"Video file {video_fp} is missing, skip_missing_files={self.skip_missing_files}.") + except Exception as e: + # if the video exists, but loading failed + if self.is_strict_loading: + raise ValueError(f"Video loading failed for {video_fp}, is_strict_loading={self.is_strict_loading}.") from e + else: + print0("[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Warning: using the pure black image as the frame sample") + imgs = Image.new("RGB", (self.video_params["input_width"], self.video_params["input_height"]), (0, 0, 0)) + imgs = v2.ToTensor()(imgs).unsqueeze(0) + + if self.transforms is not None: + # imgs: (T, C, H, W) + imgs = self.transforms(imgs) + + if imgs.shape[0] < self.video_params["sample_num_frames"]: + imgs = torch.cat([imgs, imgs[-1].unsqueeze(0).repeat(self.video_params["sample_num_frames"] - imgs.shape[0], 1, 1, 1)], dim=0) + + imgs = imgs.permute(1, 0, 2, 3) # (C, T, H, W) + + return { + 'jpg': imgs, + "path": video_fp + } + + +class VidTokValDataset(Dataset): + def __init__( + self, + data_dir: str, + video_params: dict, + meta_path: Union[None, str] = None, + pre_load_frames: bool = True, + is_strict_loading: bool = True, + last_frames_handle: str = "repeat", # 'repeat', 'drop' + skip_missing_files: bool = False, + read_long_video: bool = False, + chunk_size: int = 16, + is_causal: bool = True, + ): + super().__init__() + + self.data_dir = data_dir + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Use data dir: {self.data_dir}" + ) + + self.meta_path = meta_path + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Use meta path: {self.meta_path}" + ) + + self.video_params = video_params + self.read_long_video = read_long_video + self.chunk_size = chunk_size + self.is_causal = is_causal + + self.is_strict_loading = is_strict_loading + self.last_frames_handle = last_frames_handle + self.skip_missing_files = skip_missing_files + self.transforms = self._get_transforms( + video_params["input_height"], + video_params["input_width"], + ) + + self.missing_files = [] + self._load_metadata() + self._load_every_frame_from_meta() + + if pre_load_frames: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Pre-loading all frames into CPU..." 
+ ) + self._pre_load_frames() + + def _get_transforms(self, input_height, input_width, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5]): + normalize = v2.Normalize(mean=norm_mean, std=norm_std) + return v2.Compose( + [ + v2.Resize(input_height, antialias=True), + v2.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + def _load_metadata(self): + if self.meta_path is not None: + metadata = pd.read_csv( + self.meta_path, + on_bad_lines="skip", + encoding="ISO-8859-1", + engine="python", + sep=",", + ) + self.metadata = metadata + self.metadata.dropna(inplace=True) + else: + self.metadata = glob.glob(os.path.join(self.data_dir, '**', '*.mp4'), recursive=True) + + def _load_every_frame_from_meta(self): + decord.bridge.set_bridge("torch") + self.frames_batch = [] + for video_idx in range(len(self.metadata)): + try: + sample = self.metadata.iloc[video_idx] + video_fp, _ = self._get_video_path(sample) + except: + video_fp = self.metadata[video_idx] + if os.path.isfile(video_fp): + video_reader = decord.VideoReader(video_fp, num_threads=0) + total_frames = len(video_reader) + fps = video_reader.get_avg_fps() # float + interval = round(fps / self.video_params["sample_fps"]) + frame_ids = list(range(0, total_frames, interval)) + + if self.read_long_video: + video_length = len(frame_ids) + if self.is_causal and video_length > self.chunk_size: + num_frames_ids = frame_ids[:self.chunk_size * ((video_length - 1) // self.chunk_size) + 1] + elif not self.is_causal and video_length >= self.chunk_size: + num_frames_ids = frame_ids[:self.chunk_size * (video_length // self.chunk_size)] + else: + continue + self.frames_batch.append( + { + "video_fp": video_fp, + "num_frames_ids": num_frames_ids, + } + ) + else: + for x in range(0, len(frame_ids), self.video_params["sample_num_frames"]): + num_frames_ids = frame_ids[x : x + self.video_params["sample_num_frames"]] + if len(num_frames_ids) < self.video_params["sample_num_frames"]: + if self.last_frames_handle == "repeat": + num_frames_ids += [num_frames_ids[-1]] * ( + self.video_params["sample_num_frames"] - len(num_frames_ids) + ) + elif self.last_frames_handle == "drop": + continue + else: + raise ValueError(f"Invalid last_frames_handle: {self.last_frames_handle}") + self.frames_batch.append( + { + "video_fp": video_fp, + "num_frames_ids": num_frames_ids, + } + ) + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Loaded all frames index from {len(self.metadata)} videos." 
+ ) + + def _pre_load_frames(self): + last_video_fp = None + for idx in trange(len(self.frames_batch), desc="Pre-loading all frames"): + if self.frames_batch[idx]["video_fp"] != last_video_fp: + video_reader = decord.VideoReader(self.frames_batch[idx]["video_fp"], num_threads=0) + last_video_fp = self.frames_batch[idx]["video_fp"] + self.frames_batch[idx]["frames"] = ( + video_reader.get_batch(self.frames_batch[idx]["num_frames_ids"]).permute(0, 3, 1, 2).float() + / 255.0 + ) + + def _get_video_path(self, sample): + """reduce the access to the disk""" + rel_video_fp = str(sample["videos"]) + abs_video_fp = os.path.join(self.data_dir, rel_video_fp) + return abs_video_fp, rel_video_fp + + def __len__(self): + return len(self.frames_batch) + + def __getitem__(self, item): + video_fp = self.frames_batch[item]["video_fp"] + + try: + if "frames" in self.frames_batch[item]: + imgs = self.frames_batch[item]["frames"] + elif os.path.isfile(video_fp): + video_reader = decord.VideoReader(video_fp, num_threads=0) + imgs = ( + video_reader.get_batch(self.frames_batch[item]["num_frames_ids"]).permute(0, 3, 1, 2).float() + / 255.0 + ) + else: + # if the video file is missing + if video_fp not in self.missing_files: + self.missing_files.append(video_fp) + # resample another video or not + if self.skip_missing_files: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: missing video file {video_fp}. Resampling another video." + ) + return self.__getitem__(np.random.choice(self.__len__())) + else: + raise ValueError(f"Video file {video_fp} is missing, skip_missing_files={self.skip_missing_files}.") + except Exception as e: + # if the video exists, but loading failed + if self.is_strict_loading: + raise ValueError( + f"Video loading failed for {video_fp}, is_strict_loading={self.is_strict_loading}." + ) from e + else: + print0( + "[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: using the pure black image as the frame sample" + ) + imgs = Image.new( + "RGB", (self.video_params["input_width"], self.video_params["input_height"]), (0, 0, 0) + ) + imgs = v2.ToTensor()(imgs).unsqueeze(0) + + if self.transforms is not None: + imgs = self.transforms(imgs) + + if not self.read_long_video: + if imgs.shape[0] < self.video_params["sample_num_frames"]: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: video {video_fp} has less frames {imgs.shape[0]} than sample_num_frames {self.video_params['sample_num_frames']}." 
+ ) + imgs = torch.cat( + [imgs, imgs[-1].unsqueeze(0).repeat(self.video_params["sample_num_frames"] - imgs.shape[0], 1, 1, 1)], + dim=0, + ) + + imgs = imgs.permute(1, 0, 2, 3) # (C, T, H, W) + + return { + "jpg": imgs, + "path": video_fp, + } diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/models/autoencoder.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/models/autoencoder.py new file mode 100644 index 0000000000000000000000000000000000000000..96da5e6c74621b4f82538453e850567245adbbb0 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/models/autoencoder.py @@ -0,0 +1,517 @@ +import re +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union, Optional, List +from omegaconf import ListConfig +from packaging import version + +import torch +import lightning.pytorch as pl + +from safetensors.torch import load_file as load_safetensors +from vidtok.modules.ema import LitEma +from vidtok.modules.util import (default, get_obj_from_str, + instantiate_from_config, print0) +from vidtok.modules.regularizers import pack_one, unpack_one, rearrange + + +class AbstractAutoencoder(pl.LightningModule): + """ + This is the base class for all autoencoders + """ + + def __init__( + self, + ema_decay: Union[None, float] = None, + monitor: Union[None, str] = None, + mode: Union[None, str] = None, + input_key: str = "jpg", + ): + super().__init__() + + self.input_key = input_key + self.use_ema = ema_decay is not None + self.ema_decay = ema_decay + if monitor is not None: + self.monitor = monitor + if mode is not None: + self.mode = mode + + if version.parse(torch.__version__) >= version.parse("2.0.0"): + self.automatic_optimization = False + + @abstractmethod + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + raise NotImplementedError() + + @abstractmethod + def get_input(self, batch) -> Any: + raise NotImplementedError() + + def on_train_batch_end(self, *args, **kwargs): + # for EMA computation + if self.use_ema: + self.model_ema(self) + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Switched to EMA weights" + ) + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Restored training weights" + ) + + @abstractmethod + def encode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] encode()-method of abstract base class called" + ) + + @abstractmethod + def decode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] decode()-method of abstract base class called" + ) + + def instantiate_optimizer_from_config(self, params, lr, cfg): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] loading >>> {cfg['target']} <<< optimizer from config" + ) + return get_obj_from_str(cfg["target"])(params, lr=lr, **cfg.get("params", dict())) + + @abstractmethod + def configure_optimizers(self) -> Any: + raise NotImplementedError() + + +class 
AutoencodingEngine(AbstractAutoencoder): + """ + Base class for all video tokenizers that we train + """ + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + verbose = kwargs.pop("verbose", True) + super().__init__(*args, **kwargs) + + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + self.optimizer_config = default(optimizer_config, {"target": "torch.optim.Adam"}) + self.lr_g_factor = lr_g_factor + self.is_causal = self.encoder.is_causal + + self.temporal_compression_ratio = 2 ** len(self.encoder.tempo_ds) + self.use_tiling = False + # Decode more latent frames at once + self.num_sample_frames_batch_size = 16 + self.num_latent_frames_batch_size = self.num_sample_frames_batch_size // self.temporal_compression_ratio + # We make the minimum height and width of sample for tiling half that of the generally supported + self.tile_sample_min_height = 256 + self.tile_sample_min_width = 256 + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = 0 # 1 / 8 + self.tile_overlap_factor_width = 0 # 1 / 8 + + if self.use_ema: + self.model_ema = LitEma(self, decay=self.ema_decay) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Keeping EMAs of {len(list(self.model_ema.buffers()))}." + ) + + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Use ckpt_path: {ckpt_path}" + ) + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, verbose=verbose) + + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + if path.endswith("ckpt"): + ckpt = torch.load(path, map_location="cpu") + weights = ckpt["state_dict"] if "state_dict" in ckpt else ckpt + elif path.endswith("safetensors"): + weights = load_safetensors(path) + else: + raise NotImplementedError(f"Unknown checkpoint: {path}") + + keys = list(weights.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Deleting key {k} from state_dict." 
+ ) + del weights[k] + + missing, unexpected = self.load_state_dict(weights, strict=False) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if verbose: + if len(missing) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Missing Keys: {missing}" + ) + if len(unexpected) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Unexpected Keys: {unexpected}" + ) + + def get_input(self, batch: Dict) -> torch.Tensor: + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(filter(lambda p: p.requires_grad, self.encoder.parameters())) + + list(filter(lambda p: p.requires_grad, self.decoder.parameters())) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def enable_tiling( + self, + tile_sample_min_height: Optional[int] = None, + tile_sample_min_width: Optional[int] = None, + tile_overlap_factor_height: Optional[float] = None, + tile_overlap_factor_width: Optional[float] = None, + ) -> None: + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + self.use_tiling = False + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + if self.use_tiling: + z = self.tile_encode(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + else: + z = self.encoder(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + + if return_reg_log: + return z, reg_log + return z + + def tile_encode(self, x: Any) -> Any: + + num_frames, height, width = x.shape[-3:] + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = 
self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = [[0, num_frames]] + result_z = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile = self.encoder(tile) + result_z.append(tile) + + row.append(torch.cat(result_z, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + enc = torch.cat(result_rows, dim=3) + + return enc + + def indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + token_indices = rearrange(token_indices, "... -> ... 1") + token_indices, ps = pack_one(token_indices, "b * d") + codes = self.regularization.indices_to_codes(token_indices) + codes = rearrange(codes, "b d n c -> b n (c d)") + z = self.regularization.project_out(codes) + z = unpack_one(z, ps, "b * d") + z = rearrange(z, "b ... d -> b d ...") + return z + + def decode(self, z: Any, decode_from_indices: bool = False) -> torch.Tensor: + if decode_from_indices: + z = self.indices_to_latent(z) + if self.use_tiling: + x = self.tile_decode(z) + else: + x = self.decoder(z) + return x + + def tile_decode(self, z: Any) -> torch.Tensor: + + num_frames, height, width = z.shape[-3:] + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. 
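+        # Each tile is decoded independently; when a non-zero overlap factor is
+        # configured, the overlapping borders are linearly blended (blend_v /
+        # blend_h) and cropped to row_limit_* so the stitched output has no
+        # visible seams between tiles.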
+ rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = [[0, num_frames]] + time = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + tile = z[ + :, + :, + start_frame : end_frame, + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + tile = self.decoder(tile) + if self.is_causal and end_frame + 1 <= num_frames: + tile = tile[:, :, : -self.encoder.time_downsample_factor] + time.append(tile) + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + return dec + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if self.encoder.fix_encoder: + with torch.no_grad(): + z, reg_log = self.encode(x, return_reg_log=True) + else: + z, reg_log = self.encode(x, return_reg_log=True) + + dec = self.decode(z) + return z, dec, reg_log + + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + opt_g, opt_d = self.optimizers() + + # autoencode loss + self.toggle_optimizer(opt_g) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + + # gradient clip + torch.nn.utils.clip_grad_norm_(self.get_autoencoder_params(), 20.0) + opt_g.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + torch.nn.utils.clip_grad_norm_(self.get_discriminator_params(), 20.0) + opt_d.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + lr = opt_g.param_groups[0]["lr"] + self.log( + "lr_abs", + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + ) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + 
regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + log_dict_ae.update(log_dict_disc) + self.log_dict(log_dict_ae) + return log_dict_ae + + def configure_optimizers(self) -> Any: + ae_params = self.get_autoencoder_params() + disc_params = self.get_discriminator_params() + + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + opt_disc = self.instantiate_optimizer_from_config(disc_params, self.learning_rate, self.optimizer_config) + + return [opt_ae, opt_disc], [] + + @torch.no_grad() + def log_images(self, batch: Dict) -> Dict: + log = dict() + x = self.get_input(batch) + _, xrec, _ = self(x) + log["inputs"] = x + log["recs"] = xrec + with self.ema_scope(): + _, xrec_ema, _ = self(x) + log["recs_ema"] = xrec_ema + return log diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/models/autoencoder_v1_1.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/models/autoencoder_v1_1.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1182f573eee05b634daeb9a065b7adaacc600b --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/models/autoencoder_v1_1.py @@ -0,0 +1,588 @@ +import re +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union, Optional, List +from omegaconf import ListConfig +from packaging import version + +import torch +import lightning.pytorch as pl + +from safetensors.torch import load_file as load_safetensors +from vidtok.modules.ema import LitEma +from vidtok.modules.util import (default, get_obj_from_str, + instantiate_from_config, print0) +from vidtok.modules.regularizers import pack_one, unpack_one, rearrange + + +class AbstractAutoencoder(pl.LightningModule): + """ + This is the base class for all autoencoders + """ + + def __init__( + self, + ema_decay: Union[None, float] = None, + monitor: Union[None, str] = None, + mode: Union[None, str] = None, + input_key: str = "jpg", + ): + super().__init__() + + self.input_key = input_key + self.use_ema = ema_decay is not None + self.ema_decay = ema_decay + if monitor is not None: + self.monitor = monitor + if mode is not None: + self.mode = mode + + if version.parse(torch.__version__) >= version.parse("2.0.0"): + self.automatic_optimization = False + + @abstractmethod + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + raise NotImplementedError() + + @abstractmethod + def get_input(self, batch) -> Any: + raise NotImplementedError() + + def on_train_batch_end(self, *args, **kwargs): + # for EMA computation + if self.use_ema: + self.model_ema(self) + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Switched to EMA weights" + ) + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Restored training weights" + ) + + @abstractmethod + def encode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold 
magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] encode()-method of abstract base class called" + ) + + @abstractmethod + def decode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] decode()-method of abstract base class called" + ) + + def instantiate_optimizer_from_config(self, params, lr, cfg): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] loading >>> {cfg['target']} <<< optimizer from config" + ) + return get_obj_from_str(cfg["target"])(params, lr=lr, **cfg.get("params", dict())) + + @abstractmethod + def configure_optimizers(self) -> Any: + raise NotImplementedError() + + +class AutoencodingEngine(AbstractAutoencoder): + """ + Base class for all video tokenizers that we train + """ + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + use_tiling: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + verbose = kwargs.pop("verbose", True) + self.use_tiling = kwargs.pop("use_tiling", False) + self.t_chunk_enc = kwargs.pop("t_chunk_enc", 16) + super().__init__(*args, **kwargs) + + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + self.optimizer_config = default(optimizer_config, {"target": "torch.optim.Adam"}) + self.lr_g_factor = lr_g_factor + + self.t_chunk_dec = self.t_chunk_enc // self.encoder.time_downsample_factor + self.use_overlap = False + self.is_causal = self.encoder.is_causal + + self.temporal_compression_ratio = 2 ** len(self.encoder.tempo_ds) + + self.use_tiling = use_tiling + # Decode more latent frames at once + self.num_sample_frames_batch_size = 16 + self.num_latent_frames_batch_size = self.num_sample_frames_batch_size // self.temporal_compression_ratio + + # We make the minimum height and width of sample for tiling half that of the generally supported + self.tile_sample_min_height = 256 + self.tile_sample_min_width = 256 + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = 0 # 1 / 8 + self.tile_overlap_factor_width = 0 # 1 / 8 + + if self.use_ema: + self.model_ema = LitEma(self, decay=self.ema_decay) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Keeping EMAs of {len(list(self.model_ema.buffers()))}." 
+ ) + + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Use ckpt_path: {ckpt_path}" + ) + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, verbose=verbose) + + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + if path.endswith("ckpt"): + ckpt = torch.load(path, map_location="cpu") + weights = ckpt["state_dict"] if "state_dict" in ckpt else ckpt + elif path.endswith("safetensors"): + weights = load_safetensors(path) + else: + raise NotImplementedError(f"Unknown checkpoint: {path}") + + keys = list(weights.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Deleting key {k} from state_dict." + ) + del weights[k] + + missing, unexpected = self.load_state_dict(weights, strict=False) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if verbose: + if len(missing) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Missing Keys: {missing}" + ) + if len(unexpected) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Unexpected Keys: {unexpected}" + ) + + def get_input(self, batch: Dict) -> torch.Tensor: + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(filter(lambda p: p.requires_grad, self.encoder.parameters())) + + list(filter(lambda p: p.requires_grad, self.decoder.parameters())) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + def _empty_causal_cached(self, parent): + for name, module in parent.named_modules(): + if hasattr(module, 'causal_cache'): + module.causal_cache = None + + def _set_first_chunk(self, is_first_chunk=True): + for module in self.modules(): + if hasattr(module, 'is_first_chunk'): + module.is_first_chunk = is_first_chunk + + def _set_cache_offset(self, modules, cache_offset=0): + for module in modules: + for submodule in module.modules(): + if hasattr(submodule, 'cache_offset'): + submodule.cache_offset = cache_offset + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def build_chunk_start_end(self, t, decoder_mode=False): + start_end = [[0, 1]] + start = 1 + end = start + while True: + if start >= t: + break + end = min(t, end + (self.t_chunk_dec if decoder_mode else self.t_chunk_enc)) + start_end.append([start, end]) + start = end + return start_end + + def enable_tiling( + self, + 
tile_sample_min_height: Optional[int] = None, + tile_sample_min_width: Optional[int] = None, + tile_overlap_factor_height: Optional[float] = None, + tile_overlap_factor_width: Optional[float] = None, + ) -> None: + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + self.use_tiling = False + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + self._empty_causal_cached(self.encoder) + self._set_first_chunk(True) + + if self.use_tiling: + z = self.tile_encode(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + else: + z = self.encoder(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + + if return_reg_log: + return z, reg_log + return z + + def tile_encode(self, x: Any) -> Any: + + num_frames, height, width = x.shape[-3:] + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + rows = [] + + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = self.build_chunk_start_end(num_frames) + result_z = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + self._set_first_chunk(idx == 0) + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile = self.encoder(tile) + result_z.append(tile) + row.append(torch.cat(result_z, dim=2)) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + enc = torch.cat(result_rows, dim=3) + + return enc + + def indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + assert token_indices.dim() == 4, "token_indices should be of shape (b, t, h, w)" + b, t, h, w = token_indices.shape + token_indices = token_indices.unsqueeze(-1).reshape(b, -1, 1) + codes = self.regularization.indices_to_codes(token_indices) + codes = codes.permute(0, 2, 3, 1).reshape(b, codes.shape[2], -1) + z = self.regularization.project_out(codes) + return z.reshape(b, t, h, w, -1).permute(0, 4, 1, 2, 3) + + def tile_indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + num_frames = token_indices.shape[1] + start_end = 
self.build_chunk_start_end(num_frames, decoder_mode=True) + result_z = [] + for (start, end) in start_end: + chunk = token_indices[:, start:end, :, :] + chunk_z = self.indices_to_latent(chunk) + result_z.append(chunk_z.clone()) + return torch.cat(result_z, dim=2) + + def decode(self, z: Any, decode_from_indices: bool = False) -> torch.Tensor: + if decode_from_indices: + if self.use_tiling: + z = self.tile_indices_to_latent(z) + else: + z = self.indices_to_latent(z) + self._empty_causal_cached(self.decoder) + self._set_first_chunk(True) + + if self.use_tiling: + x = self.tile_decode(z) + else: + x = self.decoder(z) + return x + + + def tile_decode(self, z: Any) -> torch.Tensor: + + num_frames, height, width = z.shape[-3:] + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + if self.is_causal: + assert self.encoder.time_downsample_factor in [2, 4, 8], "Only support 2x, 4x or 8x temporal downsampling now." + if self.encoder.time_downsample_factor == 4: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1]], 2) + self._set_cache_offset([self.decoder.up_temporal[1].upsample, self.decoder.up_temporal[0], self.decoder.conv_out], 4) + elif self.encoder.time_downsample_factor == 2: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1], self.decoder.up_temporal[0], self.decoder.conv_out], 2) + else: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[3].upsample, self.decoder.up_temporal[2]], 2) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1]], 4) + self._set_cache_offset([self.decoder.up_temporal[1].upsample, self.decoder.up_temporal[0], self.decoder.conv_out], 8) + + start_end = self.build_chunk_start_end(num_frames, decoder_mode=True) + time = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + self._set_first_chunk(idx == 0) + tile = z[ + :, + :, + start_frame : (end_frame + 1 if self.is_causal and end_frame + 1 <= num_frames else end_frame), + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + tile = self.decoder(tile) + if self.is_causal and end_frame + 1 <= num_frames: + tile = tile[:, :, : -self.encoder.time_downsample_factor] + time.append(tile) + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) 
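+            # each blended tile has been cropped to its non-overlapping region;
+            # concatenate the tiles of this row along the width axis (dim=4)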
+ result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + return dec + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if self.encoder.fix_encoder: + with torch.no_grad(): + z, reg_log = self.encode(x, return_reg_log=True) + else: + z, reg_log = self.encode(x, return_reg_log=True) + dec = self.decode(z) + if dec.shape[2] != x.shape[2]: + dec = dec[:, :, -x.shape[2]:, ...] + return z, dec, reg_log + + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + opt_g, opt_d = self.optimizers() + + # autoencode loss + self.toggle_optimizer(opt_g) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + + # gradient clip + torch.nn.utils.clip_grad_norm_(self.get_autoencoder_params(), 20.0) + opt_g.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + torch.nn.utils.clip_grad_norm_(self.get_discriminator_params(), 20.0) + opt_d.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + lr = opt_g.param_groups[0]["lr"] + self.log( + "lr_abs", + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + ) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + log_dict_ae.update(log_dict_disc) + self.log_dict(log_dict_ae) + return log_dict_ae + + def configure_optimizers(self) -> Any: + ae_params = self.get_autoencoder_params() + disc_params = self.get_discriminator_params() + + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + opt_disc = self.instantiate_optimizer_from_config(disc_params, self.learning_rate, self.optimizer_config) + + return [opt_ae, opt_disc], [] + + @torch.no_grad() + def log_images(self, batch: Dict) -> Dict: + log = dict() + x = self.get_input(batch) + _, xrec, _ = self(x) + log["inputs"] = x + log["recs"] = xrec + with self.ema_scope(): + _, xrec_ema, _ = 
self(x) + log["recs_ema"] = xrec_ema + return log \ No newline at end of file diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2441aebdb3844a7ef2103c501213cd4537abea6b Binary files /dev/null and b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc differ diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/discriminator.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/discriminator.py new file mode 100644 index 0000000000000000000000000000000000000000..f9d94b21b22f5019f3cdcc4cbf2e98bd0ce0ee02 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/discriminator.py @@ -0,0 +1,201 @@ +import functools + +import torch +import torch.nn as nn + + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + nn.init.normal_(m.weight.data, 0.0, 0.02) + elif classname.find("BatchNorm") != -1: + nn.init.normal_(m.weight.data, 1.0, 0.02) + nn.init.constant_(m.bias.data, 0) + + +class ActNorm(nn.Module): + def __init__(self, num_features, logdet=False, affine=True, allow_reverse_init=False): + assert affine + super().__init__() + self.logdet = logdet + self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1)) + self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1)) + self.allow_reverse_init = allow_reverse_init + + self.register_buffer("initialized", torch.tensor(0, dtype=torch.uint8)) + + def initialize(self, input): + with torch.no_grad(): + flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1) + mean = flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).permute(1, 0, 2, 3) + std = flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).permute(1, 0, 2, 3) + + self.loc.data.copy_(-mean) + self.scale.data.copy_(1 / (std + 1e-6)) + + def forward(self, input, reverse=False): + if reverse: + return self.reverse(input) + if len(input.shape) == 2: + input = input[:, :, None, None] + squeeze = True + else: + squeeze = False + + _, _, height, width = input.shape + + if self.training and self.initialized.item() == 0: + self.initialize(input) + self.initialized.fill_(1) + + h = self.scale * (input + self.loc) + + if squeeze: + h = h.squeeze(-1).squeeze(-1) + + if self.logdet: + log_abs = torch.log(torch.abs(self.scale)) + logdet = height * width * torch.sum(log_abs) + logdet = logdet * torch.ones(input.shape[0]).to(input) + return h, logdet + + return h + + def reverse(self, output): + if self.training and self.initialized.item() == 0: + if not self.allow_reverse_init: + raise RuntimeError( + "Initializing ActNorm in reverse direction is " + "disabled by default. Use allow_reverse_init=True to enable." 
+ ) + else: + self.initialize(output) + self.initialized.fill_(1) + + if len(output.shape) == 2: + output = output[:, :, None, None] + squeeze = True + else: + squeeze = False + + h = output / self.scale - self.loc + + if squeeze: + h = h.squeeze(-1).squeeze(-1) + return h + + +class NLayerDiscriminator(nn.Module): + """Defines a PatchGAN discriminator as in Pix2Pix.""" + # https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py + def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): + """Construct a PatchGAN discriminator + Parameters: + input_nc (int) -- the number of channels in input images + ndf (int) -- the number of filters in the last conv layer + n_layers (int) -- the number of conv layers in the discriminator + """ + super(NLayerDiscriminator, self).__init__() + if not use_actnorm: + norm_layer = nn.BatchNorm2d + else: + norm_layer = ActNorm + if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters + use_bias = norm_layer.func != nn.BatchNorm2d + else: + use_bias = norm_layer != nn.BatchNorm2d + + kw = 4 + padw = 1 + sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)] + nf_mult = 1 + nf_mult_prev = 1 + for n in range(1, n_layers): # gradually increase the number of filters + nf_mult_prev = nf_mult + nf_mult = min(2**n, 8) + sequence += [ + nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + nf_mult_prev = nf_mult + nf_mult = min(2**n_layers, 8) + sequence += [ + nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + sequence += [ + nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) + ] # output 1 channel prediction map + self.main = nn.Sequential(*sequence) + + def forward(self, input): + """Standard forward.""" + return self.main(input) + + +class NLayerDiscriminator3D(nn.Module): + """Defines a 3D PatchGAN discriminator as in Pix2Pix but for 3D inputs.""" + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/losses/discriminator.py + def __init__(self, input_nc=1, ndf=64, n_layers=3, use_actnorm=False): + """ + Construct a 3D PatchGAN discriminator + + Parameters: + input_nc (int) -- the number of channels in input volumes + ndf (int) -- the number of filters in the last conv layer + n_layers (int) -- the number of conv layers in the discriminator + use_actnorm (bool) -- flag to use actnorm instead of batchnorm + """ + super(NLayerDiscriminator3D, self).__init__() + if not use_actnorm: + norm_layer = nn.BatchNorm3d + else: + raise NotImplementedError("Not implemented.") + if type(norm_layer) == functools.partial: + use_bias = norm_layer.func != nn.BatchNorm3d + else: + use_bias = norm_layer != nn.BatchNorm3d + + kw = 3 + padw = 1 + sequence = [nn.Conv3d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)] + nf_mult = 1 + nf_mult_prev = 1 + for n in range(1, n_layers): # gradually increase the number of filters + nf_mult_prev = nf_mult + nf_mult = min(2**n, 8) + sequence += [ + nn.Conv3d( + ndf * nf_mult_prev, + ndf * nf_mult, + kernel_size=(kw, kw, kw), + stride=(2 if n == 1 else 1, 2, 2), + padding=padw, + bias=use_bias, + ), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + nf_mult_prev = nf_mult + nf_mult = 
min(2**n_layers, 8) + sequence += [ + nn.Conv3d( + ndf * nf_mult_prev, ndf * nf_mult, kernel_size=(kw, kw, kw), stride=1, padding=padw, bias=use_bias + ), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + sequence += [ + nn.Conv3d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) + ] # output 1 channel prediction map + self.main = nn.Sequential(*sequence) + + def forward(self, input): + """Standard forward.""" + return self.main(input) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/distributions.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..76e814475d4d32b9f5ead736cce3a234bb5a0e5f --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/distributions.py @@ -0,0 +1,49 @@ +import numpy as np +import torch + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) + return x + + def kl(self, other=None): + if self.deterministic: + return torch.Tensor([0.0]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3], + ) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + dim=[1, 2, 3], + ) + + def nll(self, sample, dims=[1, 2, 3]): + if self.deterministic: + return torch.Tensor([0.0]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims, + ) + + def mode(self): + return self.mean diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/ema.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..9f1f7606c2c9b68ebd2302215a9e08f9f31ed8ab --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/ema.py @@ -0,0 +1,82 @@ +import torch +from torch import nn + + +class LitEma(nn.Module): + def __init__(self, model, decay=0.9999, use_num_upates=True): + super().__init__() + if decay < 0.0 or decay > 1.0: + raise ValueError("Decay must be between 0 and 1") + + self.m_name2s_name = {} + self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32)) + self.register_buffer( + "num_updates", + torch.tensor(0, dtype=torch.int) if use_num_upates else torch.tensor(-1, dtype=torch.int), + ) + + for name, p in model.named_parameters(): + if p.requires_grad: + # remove as '.'-character is not allowed in buffers + s_name = name.replace(".", "") + self.m_name2s_name.update({name: s_name}) + self.register_buffer(s_name, p.clone().detach().data) + + self.collected_params = [] + + def reset_num_updates(self): + del self.num_updates + self.register_buffer("num_updates", torch.tensor(0, dtype=torch.int)) + + def forward(self, model): + decay = self.decay + + if self.num_updates >= 0: + self.num_updates += 1 + decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) + + 
one_minus_decay = 1.0 - decay + + with torch.no_grad(): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + + for key in m_param: + if m_param[key].requires_grad: + sname = self.m_name2s_name[key] + shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) + shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) + else: + assert not key in self.m_name2s_name + + def copy_to(self, model): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + for key in m_param: + if m_param[key].requires_grad: + m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) + else: + assert not key in self.m_name2s_name + + def store(self, parameters): + """ + Save the current parameters for restoring later. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.collected_params = [param.clone() for param in parameters] + + def restore(self, parameters): + """ + Restore the parameters stored with the `store` method. + Useful to validate the model with EMA parameters without affecting the + original optimization process. Store the parameters before the + `copy_to` method. After validation (or model saving), use this to + restore the former parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored parameters. + """ + for c_param, param in zip(self.collected_params, parameters): + param.data.copy_(c_param.data) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/logger.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..23c7d066e6508433cb2141c59dbd80cb0030ab6d --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/logger.py @@ -0,0 +1,289 @@ +import os +import numpy as np +import einops +import imageio +from typing import Union +from matplotlib import pyplot as plt +from PIL import Image, ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True # UnidentifiedImageError: https://github.com/python-pillow/Pillow/issues/5631 +from pathlib import Path + +import torch +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +from .util import exists, isheatmap + + +class ImageVideoLogger(Callback): + def __init__( + self, + batch_frequency, + max_samples, + clamp=True, + increase_log_steps=True, + batch_frequency_val=None, + video_fps=8, + rescale=True, + disabled=False, + log_on_batch_idx=True, # log on batch_idx instead of global_step. global_step is fixed in validation. 
batch_idx restarts at each validation + log_first_step=True, + log_images_kwargs=None, + log_videos_kwargs=None, + log_before_first_step=True, + enable_autocast=True, + ): + super().__init__() + self.enable_autocast = enable_autocast + self.rescale = rescale + self.batch_freq = batch_frequency + self.batch_freq_val = batch_frequency_val if batch_frequency_val is not None else batch_frequency + self.video_fps = video_fps + self.max_samples = max_samples + self.log_steps = [2**n for n in range(int(np.log2(self.batch_freq)) + 1)] + if not increase_log_steps: + self.log_steps = [self.batch_freq] + self.clamp = clamp + self.disabled = disabled + self.log_on_batch_idx = log_on_batch_idx + self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {} + self.log_videos_kwargs = log_videos_kwargs if log_videos_kwargs else {} + self.log_first_step = log_first_step + self.log_before_first_step = log_before_first_step + + @rank_zero_only + def log_img_local( + self, + save_dir, + split, + images, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "images", split) + for k in images: + if isheatmap(images[k]): + fig, ax = plt.subplots() + ax = ax.matshow( + images[k].cpu().numpy(), cmap="hot", interpolation="lanczos" + ) + plt.colorbar(ax) + plt.axis("off") + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + plt.savefig(path) + plt.close() + else: + if images[k].ndim == 5: + images[k] = einops.rearrange(images[k], "b c t h w -> (b t) c h w") + nrow = self.log_images_kwargs.get("n_rows", 8) + grid = torchvision.utils.make_grid(images[k], nrow=nrow) + if self.rescale: + grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1) + grid = grid.numpy() + grid = (grid * 255).astype(np.uint8) + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + path = os.path.join(root, filename) + os.makedirs(os.path.split(path)[0], exist_ok=True) + img = Image.fromarray(grid) + img.save(path) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "logger_log_image only supports WandbLogger currently" + pl_module.logger.log_image( + key=f"{split}/{k}", + images=[ + img, + ], + step=pl_module.global_step, + ) + + @rank_zero_only + def log_vid_local( + self, + save_dir, + split, + videos, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "videos", split) + for k in videos: + # if is video, we can add captions + if isinstance(videos[k], torch.Tensor) and videos[k].ndim == 5: + if self.rescale: + videos[k] = (videos[k] + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + frames = [videos[k][:, :, i] for i in range(videos[k].shape[2])] + frames = [torchvision.utils.make_grid(each, nrow=4) for each in frames] + frames = [einops.rearrange(each, "c h w -> 1 c h w") for each in frames] + frames = torch.clamp(torch.cat(frames, dim=0), min=0.0, max=1.0) + frames = (frames.numpy() * 255).astype(np.uint8) + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.gif".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + save_numpy_as_gif(frames, path, duration=1 / self.video_fps) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "log_videos 
only supports WandbLogger currently" + wandb.log({f"{split}/{k}": wandb.Video(frames, fps=self.video_fps)}) # k is str + + @rank_zero_only + def log_img(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + (self.check_frequency(check_idx) or self.check_frequency_val(batch_idx, split)) + and hasattr(pl_module, "log_images") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_images) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + with torch.no_grad(), torch.autocast(enabled=self.enable_autocast, device_type="cuda"): + images = pl_module.log_images(batch) + + for k in images: + N = min(images[k].shape[0], self.max_samples) + if not isheatmap(images[k]): + images[k] = images[k][:N] + if isinstance(images[k], torch.Tensor): + images[k] = images[k].detach().float().cpu() + if self.clamp and not isheatmap(images[k]): + images[k] = torch.clamp(images[k], -1.0, 1.0) + + self.log_img_local( + pl_module.logger.save_dir, + split, + images, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + @rank_zero_only + def log_vid(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + (self.check_frequency(check_idx) or self.check_frequency_val(batch_idx, split)) + and hasattr(pl_module, "log_videos") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_videos) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + with torch.no_grad(), torch.autocast(enabled=self.enable_autocast, device_type="cuda"): + videos = pl_module.log_videos( + batch, split=split, **self.log_videos_kwargs + ) + + for k in videos: + N = min(videos[k].shape[0], self.max_samples) + videos[k] = videos[k][:N] + if isinstance(videos[k], torch.Tensor): + videos[k] = videos[k].detach().float().cpu() + if self.clamp: + videos[k] = torch.clamp(videos[k], -1.0, 1.0) + + self.log_vid_local( + pl_module.logger.save_dir, + split, + videos, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + def check_frequency(self, check_idx): + if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and ( + check_idx > 0 or self.log_first_step + ): + try: + self.log_steps.pop(0) + except IndexError as e: + pass + return True + return False + + def check_frequency_val(self, check_idx, split): + if 'val' in split: + if ((check_idx % self.batch_freq_val) == 0) and ( + check_idx > 0 or self.log_first_step): + return True + return False + + @rank_zero_only + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): + if not self.disabled and (pl_module.global_step > 0 or self.log_first_step): + self.log_img(pl_module, batch, batch_idx, split="train") + self.log_vid(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): + if self.log_before_first_step and pl_module.global_step == 0: + self.log_img(pl_module, batch, batch_idx, split="train") + self.log_vid(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def 
on_validation_batch_end( + self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs + ): + if not self.disabled and pl_module.global_step > 0: + self.log_img(pl_module, batch, batch_idx, split="val") + self.log_vid(pl_module, batch, batch_idx, split="val") + if hasattr(pl_module, "calibrate_grad_norm"): + if ( + pl_module.calibrate_grad_norm and batch_idx % 25 == 0 + ) and batch_idx > 0: + self.log_gradients(trainer, pl_module, batch_idx=batch_idx) + + +def save_numpy_as_gif(frames, path, duration=None): + """ + save numpy array as gif file + """ + image_list = [] + for frame in frames: + image = frame.transpose(1, 2, 0) + image_list.append(image) + if duration: + imageio.mimsave(path, image_list, format="GIF", duration=duration, loop=0) + else: + imageio.mimsave(path, image_list, format="GIF", loop=0) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/losses.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..c96f471c72124c56025b40a17f7d7bda81446ca5 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/losses.py @@ -0,0 +1,262 @@ +from typing import Any, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .discriminator import (NLayerDiscriminator, NLayerDiscriminator3D, + weights_init) +from .lpips import LPIPS +from .util import default, print0 + + +def hinge_d_loss(logits_real, logits_fake): + loss_real = torch.mean(F.relu(1.0 - logits_real)) + loss_fake = torch.mean(F.relu(1.0 + logits_fake)) + d_loss = 0.5 * (loss_real + loss_fake) + return d_loss + + +def vanilla_d_loss(logits_real, logits_fake): + d_loss = 0.5 * (torch.mean(F.softplus(-logits_real)) + torch.mean(F.softplus(logits_fake))) + return d_loss + + +def adopt_weight(weight, global_step, threshold=0, value=0.0): + if global_step < threshold: + weight = value + return weight + + +def _sigmoid_cross_entropy_with_logits(labels, logits): + """ + non-saturating loss + """ + zeros = torch.zeros_like(logits, dtype=logits.dtype) + condition = logits >= zeros + relu_logits = torch.where(condition, logits, zeros) + neg_abs_logits = torch.where(condition, -logits, logits) + return relu_logits - logits * labels + torch.log1p(torch.exp(neg_abs_logits)) + + +def non_saturate_gen_loss(logits_fake): + """ + logits_fake: [B 1 H W] + """ + B = logits_fake.shape[0] + logits_fake = logits_fake.reshape(B, -1) + logits_fake = torch.mean(logits_fake, dim=-1) + gen_loss = torch.mean(_sigmoid_cross_entropy_with_logits(labels=torch.ones_like(logits_fake), logits=logits_fake)) + return gen_loss + + +def lecam_reg(real_pred, fake_pred, lecam_ema): + reg = torch.mean(F.relu(real_pred - lecam_ema.logits_fake_ema).pow(2)) + torch.mean( + F.relu(lecam_ema.logits_real_ema - fake_pred).pow(2) + ) + return reg + + +class LeCAM_EMA(object): + # https://github.com/TencentARC/SEED-Voken/blob/main/src/Open_MAGVIT2/modules/losses/vqperceptual.py + def __init__(self, init=0.0, decay=0.999): + self.logits_real_ema = init + self.logits_fake_ema = init + self.decay = decay + + def update(self, logits_real, logits_fake): + self.logits_real_ema = self.logits_real_ema * self.decay + torch.mean(logits_real).item() * (1 - self.decay) + self.logits_fake_ema = self.logits_fake_ema * self.decay + torch.mean(logits_fake).item() * (1 - self.decay) + + +class GeneralLPIPSWithDiscriminator(nn.Module): + def __init__( + self, + disc_start: int, + logvar_init: float = 0.0, + 
pixelloss_weight=1.0, + disc_num_layers: int = 3, + disc_in_channels: int = 3, + disc_factor: float = 1.0, + disc_weight: float = 1.0, + disc_type: str = "3d", + perceptual_weight: float = 1.0, + lecam_loss_weight: float = 0.0, + disc_loss: str = "hinge", + scale_input_to_tgt_size: bool = False, + dims: int = 2, + learn_logvar: bool = False, + regularization_weights: Union[None, dict] = None, + gen_loss_cross_entropy: bool = False, + ): + super().__init__() + self.dims = dims + if self.dims > 2: + print0( + f"[bold cyan]\[vidtok.modules.losses][GeneralLPIPSWithDiscriminator][/bold cyan] running with dims={dims}. This means that for perceptual loss calculation, " + f"the LPIPS loss will be applied to each frame independently. " + ) + self.scale_input_to_tgt_size = scale_input_to_tgt_size + assert disc_loss in ["hinge", "vanilla"] + self.pixel_weight = pixelloss_weight + self.perceptual_loss = LPIPS().eval() + self.perceptual_weight = perceptual_weight + # output log variance + self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) + self.learn_logvar = learn_logvar + self.disc_type = disc_type + assert self.disc_type in ["2d", "3d"] + + if self.disc_type == "2d": + self.discriminator = NLayerDiscriminator( + input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=False + ).apply(weights_init) + else: + self.discriminator = NLayerDiscriminator3D( + input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=False + ).apply(weights_init) + self.discriminator_iter_start = disc_start + self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss + self.disc_factor = disc_factor + self.discriminator_weight = disc_weight + self.regularization_weights = default(regularization_weights, {}) + self.gen_loss_cross_entropy = gen_loss_cross_entropy + self.lecam_loss_weight = lecam_loss_weight + if self.lecam_loss_weight > 0: + self.lecam_ema = LeCAM_EMA() + + def get_trainable_parameters(self) -> Any: + return self.discriminator.parameters() + + def get_trainable_autoencoder_parameters(self) -> Any: + if self.learn_logvar: + yield self.logvar + yield from () + + def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): + if last_layer is not None: + nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] + g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] + else: + nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] + g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] + + d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) + d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() + d_weight = d_weight * self.discriminator_weight + return d_weight + + def forward( + self, + regularization_log, + inputs, + reconstructions, + optimizer_idx, + global_step, + last_layer=None, + split="train", + weights=None, + ): + if self.scale_input_to_tgt_size: + inputs = torch.nn.functional.interpolate(inputs, reconstructions.shape[2:], mode="bicubic", antialias=True) + + if optimizer_idx == 0: + bs = inputs.shape[0] + t = inputs.shape[2] + if self.dims > 2: + inputs, reconstructions = map( + lambda x: rearrange(x, "b c t h w -> (b t) c h w"), + (inputs, reconstructions), + ) + + rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) + if self.perceptual_weight > 0: + p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) + rec_loss = rec_loss + self.perceptual_weight * p_loss + else: + p_loss = torch.Tensor([0.0]) + + 
nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar + weighted_nll_loss = nll_loss + if weights is not None: + weighted_nll_loss = weights * nll_loss + weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] + nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] + + # now the GAN part + if self.disc_type == "3d": + reconstructions = rearrange(reconstructions, "(b t) c h w -> b c t h w", t=t).contiguous() + + # generator update + logits_fake = self.discriminator(reconstructions) + + if not self.gen_loss_cross_entropy: + g_loss = -torch.mean(logits_fake) + else: + g_loss = non_saturate_gen_loss(logits_fake) + + if self.disc_factor > 0.0: + try: + d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) + except RuntimeError: + assert not self.training + d_weight = torch.tensor(0.0) + else: + d_weight = torch.tensor(0.0) + + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) + loss = weighted_nll_loss + d_weight * disc_factor * g_loss + log = dict() + for k in regularization_log: + if k in self.regularization_weights: + loss = loss + self.regularization_weights[k] * regularization_log[k] + log[f"{split}/{k}"] = regularization_log[k].detach().mean() + + log.update( + { + "{}/total_loss".format(split): loss.clone().detach().mean(), + "{}/logvar".format(split): self.logvar.detach(), + "{}/nll_loss".format(split): nll_loss.detach().mean(), + "{}/rec_loss".format(split): rec_loss.detach().mean(), + "{}/p_loss".format(split): p_loss.detach().mean(), + "{}/d_weight".format(split): d_weight.detach(), + "{}/disc_factor".format(split): torch.tensor(disc_factor), + "{}/g_loss".format(split): g_loss.detach().mean(), + } + ) + return loss, log + + if optimizer_idx == 1: + if self.disc_type == "2d" and self.dims > 2: + inputs, reconstructions = map( + lambda x: rearrange(x, "b c t h w -> (b t) c h w"), + (inputs, reconstructions), + ) + + logits_real = self.discriminator(inputs.contiguous().detach()) + logits_fake = self.discriminator(reconstructions.contiguous().detach()) + + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) + + non_saturate_d_loss = self.disc_loss(logits_real, logits_fake) + + if self.lecam_loss_weight > 0: + self.lecam_ema.update(logits_real, logits_fake) + lecam_loss = lecam_reg(logits_real, logits_fake, self.lecam_ema) + d_loss = disc_factor * (lecam_loss * self.lecam_loss_weight + non_saturate_d_loss) + else: + d_loss = disc_factor * non_saturate_d_loss + + log = { + "{}/disc_loss".format(split): d_loss.clone().detach().mean(), + "{}/logits_real".format(split): logits_real.detach().mean(), + "{}/logits_fake".format(split): logits_fake.detach().mean(), + "{}/disc_factor".format(split): torch.tensor(disc_factor), + "{}/non_saturated_d_loss".format(split): non_saturate_d_loss.detach(), + } + + if self.lecam_loss_weight > 0: + log.update({"{}/lecam_loss".format(split): lecam_loss.detach()}) + + return d_loss, log diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/lpips.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/lpips.py new file mode 100644 index 0000000000000000000000000000000000000000..22bb5fa315618c0d1f0463d67ea57c29083fe302 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/lpips.py @@ -0,0 +1,172 @@ +import hashlib +import os +from collections import namedtuple +from tqdm import tqdm + +import requests +import torch +import torch.nn as nn +from torchvision import models + +from .util 
import print0 + +URL_MAP = {"vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"} + +CKPT_MAP = {"vgg_lpips": "vgg.pth"} + +MD5_MAP = {"vgg_lpips": "d507d7349b931f0638a25a48a722f98a"} + + +def download(url, local_path, chunk_size=1024): + os.makedirs(os.path.split(local_path)[0], exist_ok=True) + with requests.get(url, stream=True) as r: + total_size = int(r.headers.get("content-length", 0)) + with tqdm(total=total_size, unit="B", unit_scale=True) as pbar: + with open(local_path, "wb") as f: + for data in r.iter_content(chunk_size=chunk_size): + if data: + f.write(data) + pbar.update(chunk_size) + + +def md5_hash(path): + with open(path, "rb") as f: + content = f.read() + return hashlib.md5(content).hexdigest() + + +def get_ckpt_path(name, root, check=False): + assert name in URL_MAP + path = os.path.join(root, CKPT_MAP[name]) + if os.path.exists(path) and not (check and not md5_hash(path) == MD5_MAP[name]): + print0( + "[bold cyan]\[vidtok.modules.lpips]\[get_ckpt_path][/bold cyan] Using existing path for {} model: {}".format( + name, path + ) + ) + return path + + # if not, download the model + print0( + "[bold cyan]\[vidtok.modules.lpips]\[get_ckpt_path][/bold cyan] Downloading {} model from {} to {}".format( + name, URL_MAP[name], path + ) + ) + download(URL_MAP[name], path) + md5 = md5_hash(path) + assert md5 == MD5_MAP[name], md5 + return path + + +class LPIPS(nn.Module): + # Learned perceptual metric + def __init__(self, use_dropout=True): + super().__init__() + self.scaling_layer = ScalingLayer() + self.chns = [64, 128, 256, 512, 512] # vg16 features + self.net = vgg16(pretrained=True, requires_grad=False) + self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout) + self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout) + self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout) + self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout) + self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout) + self.load_from_pretrained() + for param in self.parameters(): + param.requires_grad = False + + def load_from_pretrained(self, name="vgg_lpips"): + ckpt = get_ckpt_path(name, "checkpoints/lpips") + self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False) + print0("[bold cyan]\[vidtok.modules.lpips][LPIPS][/bold cyan] loaded pretrained LPIPS loss from {}".format(ckpt)) + + def forward(self, input, target): + in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target)) + outs0, outs1 = self.net(in0_input), self.net(in1_input) + feats0, feats1, diffs = {}, {}, {} + lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4] + for kk in range(len(self.chns)): + feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk]) + diffs[kk] = (feats0[kk] - feats1[kk]) ** 2 + + res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))] + val = res[0] + for l in range(1, len(self.chns)): + val += res[l] + return val + + +class ScalingLayer(nn.Module): + def __init__(self): + super(ScalingLayer, self).__init__() + self.register_buffer("shift", torch.Tensor([-0.030, -0.088, -0.188])[None, :, None, None]) + self.register_buffer("scale", torch.Tensor([0.458, 0.448, 0.450])[None, :, None, None]) + + def forward(self, inp): + return (inp - self.shift) / self.scale + + +class NetLinLayer(nn.Module): + """A single linear layer which does a 1x1 conv""" + + def __init__(self, chn_in, chn_out=1, use_dropout=False): + super(NetLinLayer, self).__init__() 
+ layers = ( + [ + nn.Dropout(), + ] + if (use_dropout) + else [] + ) + layers += [ + nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), + ] + self.model = nn.Sequential(*layers) + + +class vgg16(torch.nn.Module): + def __init__(self, requires_grad=False, pretrained=True): + super(vgg16, self).__init__() + vgg_pretrained_features = models.vgg16(pretrained=pretrained).features + self.slice1 = torch.nn.Sequential() + self.slice2 = torch.nn.Sequential() + self.slice3 = torch.nn.Sequential() + self.slice4 = torch.nn.Sequential() + self.slice5 = torch.nn.Sequential() + self.N_slices = 5 + for x in range(4): + self.slice1.add_module(str(x), vgg_pretrained_features[x]) + for x in range(4, 9): + self.slice2.add_module(str(x), vgg_pretrained_features[x]) + for x in range(9, 16): + self.slice3.add_module(str(x), vgg_pretrained_features[x]) + for x in range(16, 23): + self.slice4.add_module(str(x), vgg_pretrained_features[x]) + for x in range(23, 30): + self.slice5.add_module(str(x), vgg_pretrained_features[x]) + if not requires_grad: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, X): + h = self.slice1(X) + h_relu1_2 = h + h = self.slice2(h) + h_relu2_2 = h + h = self.slice3(h) + h_relu3_3 = h + h = self.slice4(h) + h_relu4_3 = h + h = self.slice5(h) + h_relu5_3 = h + vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]) + out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3) + return out + + +def normalize_tensor(x, eps=1e-10): + norm_factor = torch.sqrt(torch.sum(x**2, dim=1, keepdim=True)) + return x / (norm_factor + eps) + + +def spatial_average(x, keepdim=True): + return x.mean([2, 3], keepdim=keepdim) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dcausal.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dcausal.py new file mode 100644 index 0000000000000000000000000000000000000000..c71a8b09e370451046eef9ba60315feab2459e35 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dcausal.py @@ -0,0 +1,885 @@ +from typing import Callable +from beartype import beartype +from beartype.typing import Tuple, Union + +import einops +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .util import checkpoint + + +def spatial_temporal_resblk(x, block_s, block_t, temb): + assert len(x.shape) == 5, "input should be 5D tensor, but got {}D tensor".format(len(x.shape)) + B, C, T, H, W = x.shape + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + x = block_s(x, temb) + x = einops.rearrange(x, "(b t) c h w -> b c t h w", b=B, t=T) + x = einops.rearrange(x, "b c t h w -> (b h w) c t") + x = block_t(x, temb) + x = einops.rearrange(x, "(b h w) c t -> b c t h w", b=B, h=H, w=W) + return x + + +def nonlinearity(x): + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32, norm_type="groupnorm"): + if norm_type == "groupnorm": + return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + elif norm_type == "layernorm": + return LayerNorm(num_channels=in_channels, eps=1e-6) + + +def pad_at_dim(t, pad, dim=-1, pad_mode="constant", value=0.0): + assert pad_mode in ["constant", "replicate", "reflect"] + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right + if pad_mode == "constant": + return F.pad(t, (*zeros, *pad), value=value) + return F.pad(t, (*zeros, *pad), mode=pad_mode) + + +def 
divisible_by(num, den): + return (num % den) == 0 + + +def is_odd(n): + return not divisible_by(n, 2) + + +def cast_tuple(t, length=1): + return t if isinstance(t, tuple) else ((t,) * length) + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class LayerNorm(nn.Module): + def __init__(self, num_channels, eps=1e-6, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = torch.nn.LayerNorm(num_channels, eps=eps, elementwise_affine=True) + + def forward(self, x): + if x.dim() == 5: + x = rearrange(x, "b c t h w -> b t h w c") + x = self.norm(x) + x = rearrange(x, "b t h w c -> b c t h w") + elif x.dim() == 4: + x = rearrange(x, "b c h w -> b h w c") + x = self.norm(x) + x = rearrange(x, "b h w c -> b c h w") + else: + x = rearrange(x, "b c s -> b s c") + x = self.norm(x) + x = rearrange(x, "b s c -> b c s") + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__() + self.in_channels = in_channels + self.norm_type = norm_type + + self.norm = Normalize(in_channels, norm_type=self.norm_type) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c h w -> b 1 (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x, **kwargs): + if self.use_checkpoint: + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x) + + def _forward(self, x, **kwargs): + h_ = x + h_ = self.attention(h_) + h_ = self.proj_out(h_) + return x + h_ + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + B = h_.shape[0] + h_ = rearrange(h_, "b c t h w -> (b t) c h w") + h_ = self.norm(h_) + h_ = rearrange(h_, "(b t) c h w -> b c t h w", b=B) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class CausalConv1d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: int, pad_mode="constant", **kwargs): + super().__init__() + 
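+        # Causal padding in time: the padding defined below is applied only on the left (past) side,
+        # so each output step depends only on the current and earlier inputs.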
dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + self.pad_mode = pad_mode + self.time_pad = dilation * (kernel_size - 1) + (1 - stride) + self.time_causal_padding = (self.time_pad, 0) + + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + pad_mode = self.pad_mode if self.time_pad < x.shape[2] else "constant" + x = F.pad(x, self.time_causal_padding, mode=pad_mode) + return self.conv(x) + + +class CausalConv3d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: Union[int, Tuple[int, int, int]], pad_mode="constant", **kwargs): + super().__init__() + kernel_size = cast_tuple(kernel_size, 3) + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + dilation = cast_tuple(dilation, 3) + stride = cast_tuple(stride, 3) + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + assert is_odd(height_kernel_size) and is_odd(width_kernel_size) + + self.pad_mode = pad_mode + time_pad = dilation[0] * (time_kernel_size - 1) + (1 - stride[0]) + height_pad = dilation[1] * (height_kernel_size - 1) + (1 - stride[1]) + width_pad = dilation[2] * (height_kernel_size - 1) + (1 - stride[2]) + + self.time_pad = time_pad + self.time_causal_padding = ( + width_pad // 2, + width_pad - width_pad // 2, + height_pad // 2, + height_pad - height_pad // 2, + time_pad, + 0, + ) + + self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + pad_mode = self.pad_mode if self.time_pad < x.shape[2] else "constant" + + x = F.pad(x, self.time_causal_padding, mode=pad_mode) + return self.conv(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = CausalConv3d(in_channels, out_channels, 3, stride=(2, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 1, 0) + x1 = self.avg_pool(torch.nn.functional.pad(x, pad, mode="constant", value=0)) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + 
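+        # 2x temporal upsampling: nearest interpolation along time blended with a causal 3D conv
+        # through a learned gate alpha = sigmoid(mix_factor) (see forward()).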
super().__init__() + self.conv = CausalConv3d(in_channels, out_channels, 3) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode="nearest").to( + x.dtype + ) + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetCausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv3d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + else: + self.nin_shortcut = CausalConv3d( + in_channels, + out_channels, + 
kernel_size=1, + stride=1, + ) + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + return x + h + + +class ResnetCausalBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv1d(out_channels, out_channels, kernel_size=3, stride=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + else: + self.nin_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=1, stride=1) + + if zero_init: + self.conv2.conv.weight.data.zero_() + self.conv2.conv.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class EncoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_ds=None, + tempo_ds=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + 
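+        # temb_ch = 0 disables timestep embeddings: temb is always None in this autoencoder,
+        # so the temb_proj branches in the ResNet blocks are never created or used.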
self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.norm_type = norm_type + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.is_causal = True + + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.spatial_ds = list(range(0, self.num_resolutions - 1)) if spatial_ds is None else spatial_ds + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] if tempo_ds is None else tempo_ds + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetCausalBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + down_temporal.attn = attn_temporal + + if i_level in self.spatial_ds: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + down_temporal.downsample = TimeDownsampleResCausal2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls(block_in, norm_type=self.norm_type) + + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + ) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def forward(self, x): + temb = None + B, _, T, H, W = x.shape + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + + if i_level in self.spatial_ds: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal downsample + B, _, T, H, W = htmp.shape + if i_level in self.tempo_ds: + htmp = self.down_temporal[i_level].downsample(htmp) + 
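+                # Spatial downsampling runs per frame in 2D; temporal downsampling only fires at the
+                # levels listed in tempo_ds, so spatial and temporal compression factors can differ.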
+ hs.append(htmp) + B, _, T, H, W = htmp.shape + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = nonlinearity(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv_out(h) + + return h + + +class EncoderCausal3DPadding(EncoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.init_pad_mode = ignore_kwargs.get("init_pad_mode", "replicate") + self.time_padding = self.time_downsample_factor - 1 + if self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + video_len = x.shape[2] + if video_len % self.time_downsample_factor != 0: + x = pad_at_dim(x, (self.time_padding, 0), dim=2, pad_mode=self.init_pad_mode, value=0.0) + return super().forward(x) + + +class DecoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_us=None, + tempo_us=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.norm_type = norm_type + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.spatial_us = list(range(1, self.num_resolutions)) if spatial_us is None else spatial_us + self.tempo_us = [1, 2] if tempo_us is None else tempo_us + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.spatial_us: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + 
block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetCausalBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleResCausal2x(block_in, block_in) + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def get_last_layer(self, **kwargs): + try: + return self.conv_out.conv.weight + except: + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, H, W = z.shape + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + + if i_level in self.spatial_us: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal upsample + B, _, T, H, W = h.shape + if i_level in self.tempo_us: + h = self.up_temporal[i_level].upsample(h) + B, _, T, H, W = h.shape + + # end + if self.give_pre_end: + return h + + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + + if self.tanh_out: + h = torch.tanh(h) + + return h + + +class DecoderCausal3DPadding(DecoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.time_padding = self.time_downsample_factor - 1 + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + x = super().forward(x) + return x[:, :, self.time_padding :, :, :] diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dcausal_v1_1.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dcausal_v1_1.py new file mode 100644 index 0000000000000000000000000000000000000000..44397a135334a58cf5774fb899152f701d56a37d --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dcausal_v1_1.py @@ -0,0 +1,959 @@ +from typing import Callable +from beartype import beartype +from beartype.typing import Tuple, Union + +import einops +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .util import checkpoint + + +def spatial_temporal_resblk(x, block_s, block_t, temb): + assert len(x.shape) == 5, "input should be 5D tensor, but got {}D tensor".format(len(x.shape)) + B, C, T, H, W = x.shape + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + x = block_s(x, temb) + x = einops.rearrange(x, "(b t) c h w -> b c t h w", 
b=B, t=T) + x = einops.rearrange(x, "b c t h w -> (b h w) c t") + x = block_t(x, temb) + x = einops.rearrange(x, "(b h w) c t -> b c t h w", b=B, h=H, w=W) + return x + + +def nonlinearity(x): + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32, norm_type="groupnorm"): + if norm_type == "groupnorm": + return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + elif norm_type == "layernorm": + return LayerNorm(num_channels=in_channels, eps=1e-6) + + +def pad_at_dim(t, pad, dim=-1, pad_mode="constant", value=0.0): + assert pad_mode in ["constant", "replicate", "reflect"] + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right + if pad_mode == "constant": + return F.pad(t, (*zeros, *pad), value=value) + return F.pad(t, (*zeros, *pad), mode=pad_mode) + + +def divisible_by(num, den): + return (num % den) == 0 + + +def is_odd(n): + return not divisible_by(n, 2) + + +def cast_tuple(t, length=1): + return t if isinstance(t, tuple) else ((t,) * length) + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class LayerNorm(nn.Module): + def __init__(self, num_channels, eps=1e-6, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = torch.nn.LayerNorm(num_channels, eps=eps, elementwise_affine=True) + + def forward(self, x): + if x.dim() == 5: + x = rearrange(x, "b c t h w -> b t h w c") + x = self.norm(x) + x = rearrange(x, "b t h w c -> b c t h w") + elif x.dim() == 4: + x = rearrange(x, "b c h w -> b h w c") + x = self.norm(x) + x = rearrange(x, "b h w c -> b c h w") + else: + x = rearrange(x, "b c s -> b s c") + x = self.norm(x) + x = rearrange(x, "b s c -> b c s") + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__() + self.in_channels = in_channels + self.norm_type = norm_type + + self.norm = Normalize(in_channels, norm_type=self.norm_type) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c h w -> b 1 (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x, **kwargs): + if self.use_checkpoint: + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x) + + def _forward(self, x, **kwargs): + h_ = x + h_ = self.attention(h_) + h_ = self.proj_out(h_) + return x + h_ + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, 
stride=1) + self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + B = h_.shape[0] + h_ = rearrange(h_, "b c t h w -> (b t) c h w") + h_ = self.norm(h_) + h_ = rearrange(h_, "(b t) c h w -> b c t h w", b=B) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class CausalConv1d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: int, pad_mode="constant", **kwargs): + super().__init__() + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + self.pad_mode = pad_mode + self.time_pad = dilation * (kernel_size - 1) + (1 - stride) + + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + self.is_first_chunk = True + self.causal_cache = None + self.cache_offset = 0 + + def forward(self, x): + if self.is_first_chunk: + first_frame_pad = x[:, :, :1].repeat( + (1, 1, self.time_pad) + ) + else: + first_frame_pad = self.causal_cache + if self.time_pad != 0: + first_frame_pad = first_frame_pad[:, :, -self.time_pad:] + else: + first_frame_pad = first_frame_pad[:, :, 0:0] + + x = torch.concatenate((first_frame_pad, x), dim=2) + + if self.cache_offset == 0: + self.causal_cache = x.clone() + else: + self.causal_cache = x[:,:,:-self.cache_offset].clone() + + return self.conv(x) + + +class CausalConv3d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: Union[int, Tuple[int, int, int]], pad_mode="constant", **kwargs): + super().__init__() + kernel_size = cast_tuple(kernel_size, 3) + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + dilation = cast_tuple(dilation, 3) + stride = cast_tuple(stride, 3) + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + assert is_odd(height_kernel_size) and is_odd(width_kernel_size) + + self.pad_mode = pad_mode + time_pad = dilation[0] * (time_kernel_size - 1) + (1 - stride[0]) + height_pad = dilation[1] * (height_kernel_size - 1) + (1 - stride[1]) + width_pad = dilation[2] * (width_kernel_size - 1) + (1 - stride[2]) + + self.time_pad = time_pad + self.spatial_padding = ( + width_pad // 2, + width_pad - width_pad // 2, + height_pad // 2, + height_pad - height_pad // 2, + 0, + 0, + ) + + self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + self.is_first_chunk = True + self.causal_cache = None + self.cache_offset = 0 + + def forward(self, x): + if self.is_first_chunk: + first_frame_pad = x[:, :, :1, :, :].repeat( + (1, 1, self.time_pad, 1, 1) + ) + else: + first_frame_pad = self.causal_cache + if self.time_pad != 0: + first_frame_pad = first_frame_pad[:, :, -self.time_pad:] + else: + first_frame_pad = first_frame_pad[:, :, 0:0] + + x = torch.concatenate((first_frame_pad, x), dim=2) + + if self.cache_offset == 0: + self.causal_cache = x.clone() + else: + self.causal_cache = x[:,:,:-self.cache_offset].clone() + + x = F.pad(x, self.spatial_padding, mode=self.pad_mode) + return self.conv(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + 
self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = CausalConv3d(in_channels, out_channels, 3, stride=(2, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + self.is_first_chunk = True + self.causal_cache = None + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 1, 0) + + if self.is_first_chunk: + x_pad = torch.nn.functional.pad(x, pad, mode="replicate") + else: + x_pad = torch.concatenate((self.causal_cache, x), dim=2) + + self.causal_cache = x_pad[:,:,-1:].clone() + + x1 = self.avg_pool(x_pad) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + interpolation_mode='nearest', + num_temp_upsample=1 + ): + super().__init__() + self.conv = CausalConv3d(in_channels, out_channels, 3) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + self.interpolation_mode = interpolation_mode + self.num_temp_upsample = num_temp_upsample + self.enable_cached = (self.interpolation_mode == 'trilinear') + self.is_first_chunk = True + self.causal_cache = None + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + if not self.enable_cached: + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + elif not self.is_first_chunk: + x = torch.cat([self.causal_cache, x], dim=2) + self.causal_cache = x[:, :, -2*self.num_temp_upsample:-self.num_temp_upsample].clone() + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + x = x[:, :, 2*self.num_temp_upsample:] + else: + self.causal_cache = x[:, :, -self.num_temp_upsample:].clone() + x, _x = x[:, :, :self.num_temp_upsample], x[:, :, self.num_temp_upsample:] + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + if _x.shape[-3] > 0: + _x = F.interpolate(_x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(_x.dtype) + x = torch.concat([x, _x], dim=2) + + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + 
in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetCausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv3d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + else: + self.nin_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + ) + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + 
self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + return x + h + + +class ResnetCausalBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv1d(out_channels, out_channels, kernel_size=3, stride=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + else: + self.nin_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=1, stride=1) + + if zero_init: + self.conv2.conv.weight.data.zero_() + self.conv2.conv.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class EncoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_ds=None, + tempo_ds=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.norm_type = norm_type + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.is_causal = True + + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.spatial_ds = list(range(0, self.num_resolutions - 1)) if spatial_ds is 
None else spatial_ds + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] if tempo_ds is None else tempo_ds + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetCausalBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + down_temporal.attn = attn_temporal + + if i_level in self.spatial_ds: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + down_temporal.downsample = TimeDownsampleResCausal2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls(block_in, norm_type=self.norm_type) + + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + ) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def forward(self, x): + temb = None + B, _, T, H, W = x.shape + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + + if i_level in self.spatial_ds: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal downsample + B, _, T, H, W = htmp.shape + if i_level in self.tempo_ds: + htmp = self.down_temporal[i_level].downsample(htmp) + + hs.append(htmp) + B, _, T, H, W = htmp.shape + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = nonlinearity(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv_out(h) + + return h + + +class EncoderCausal3DPadding(EncoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = 
ignore_kwargs.get("time_downsample_factor", 4) + self.init_pad_mode = ignore_kwargs.get("init_pad_mode", "replicate") + + if self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + video_len = x.shape[2] + if video_len % self.time_downsample_factor != 0: + time_padding = self.time_downsample_factor - video_len % self.time_downsample_factor + x = pad_at_dim(x, (time_padding, 0), dim=2, pad_mode=self.init_pad_mode, value=0.0) + return super().forward(x) + + +class DecoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_us=None, + tempo_us=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.norm_type = norm_type + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + self.interpolation_mode = ignorekwargs.get("interpolation_mode", 'nearest') + assert self.interpolation_mode in ['nearest', 'trilinear'] + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.spatial_us = list(range(1, self.num_resolutions)) if spatial_us is None else spatial_us + self.tempo_us = [1, 2] if tempo_us is None else tempo_us + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.spatial_us: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + num_temp_upsample = 1 + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetCausalBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = 
block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleResCausal2x(block_in, block_in, interpolation_mode=self.interpolation_mode, num_temp_upsample=num_temp_upsample) + num_temp_upsample *= 2 + + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def get_last_layer(self, **kwargs): + try: + return self.conv_out.conv.weight + except: + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, H, W = z.shape + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + + if i_level in self.spatial_us: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal upsample + B, _, T, H, W = h.shape + if i_level in self.tempo_us: + h = self.up_temporal[i_level].upsample(h) + B, _, T, H, W = h.shape + + # end + if self.give_pre_end: + return h + + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + + if self.tanh_out: + h = torch.tanh(h) + + return h + + +class DecoderCausal3DPadding(DecoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + x = super().forward(x) + return x diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dnoncausal.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dnoncausal.py new file mode 100644 index 0000000000000000000000000000000000000000..4223fb635eb88cd7e7292943c131965f2b814206 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/model_3dnoncausal.py @@ -0,0 +1,652 @@ +from typing import Callable + +import einops +import torch +import torch.nn as nn +from einops import rearrange + +from .model_3dcausal import (AttnBlock, Normalize, nonlinearity, + spatial_temporal_resblk) +from .util import checkpoint + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def attention(self, h_: 
torch.Tensor) -> torch.Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleRes2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = nn.Conv3d(in_channels, out_channels, 3, stride=(2, 1, 1), padding=(0, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x1 = self.avg_pool(x) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleRes2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.conv = nn.Conv3d(in_channels, out_channels, 3, padding=1) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + xlst = [ + torch.nn.functional.interpolate( + sx.unsqueeze(0).to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode="nearest" + ).to(x.dtype) + for sx in x + ] + x = torch.cat(xlst, dim=0) + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = 
Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + if zero_init: + self.conv2.weight.data.zero_() + self.conv2.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetNoncausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + 
self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = nn.Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = nn.Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=1) + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class Encoder3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch=8, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] + self.spatial_ds = list(range(0, self.num_resolutions - 1)) # add for spatial tiling + self.norm_type = norm_type + self.is_causal = False + + # downsampling + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block 
= block_temporal + down_temporal.attn = attn_temporal + + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + down_temporal.downsample = TimeDownsampleRes2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn(block_in, norm_type=self.norm_type) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1, + ) + + if self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetNoncausalBlock + + def _make_conv(self) -> Callable: + return nn.Conv3d + + def forward(self, x): + temb = None + B, _, T, _, _ = x.shape + + # downsampling + if x.shape[1] == 4 and self.conv_in.in_channels == 3: + raise ValueError("Mismatched number of input channels") + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + if i_level != self.num_resolutions - 1: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + if i_level in self.tempo_ds: + # temporal downsample + htmp = self.down_temporal[i_level].downsample(htmp) + hs.append(htmp) + B, _, T, _, _ = htmp.shape + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels=8, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + self.tempo_us = [1, 2] + self.norm_type = norm_type + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + 
use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleRes2x(block_in, block_in) + + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1, padding=1) + + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetNoncausalBlock + + def _make_conv(self) -> Callable: + return nn.Conv3d + + def get_last_layer(self, **kwargs): + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, _, _ = z.shape + + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + if i_level != 0: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + if i_level in self.tempo_us: + # temporal upsample + h = self.up_temporal[i_level].upsample(h) + B, _, T, _, _ = h.shape + # end + if self.give_pre_end: + return h + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + if self.tanh_out: + h = torch.tanh(h) + return h diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/regularizers.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/regularizers.py new file mode 100644 index 0000000000000000000000000000000000000000..4f4f1a4fccdb873f640e637ecbaaaef389e75b41 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/regularizers.py @@ -0,0 +1,268 @@ +from abc import 
abstractmethod +from functools import cache +from typing import Any, List, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from einops import pack, rearrange, reduce, unpack +from torch import Tensor, int32 +from torch.cuda.amp import autocast + +from .distributions import DiagonalGaussianDistribution + + +def exists(v): + return v is not None + + +def default(*args): + for arg in args: + if exists(arg): + return arg + return None + + +def pack_one(t, pattern): + return pack([t], pattern) + + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + + +def round_ste(z: Tensor) -> Tensor: + """Round with straight through gradients.""" + zhat = z.round() + return z + (zhat - z).detach() + + +def log(t, eps=1e-5): + return t.clamp(min=eps).log() + + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + + +def maybe_distributed_mean(t): + if not is_distributed(): + return t + dist.all_reduce(t) + t = t / dist.get_world_size() + return t + + +@cache +def is_distributed(): + return dist.is_initialized() and dist.get_world_size() > 1 + + +class AbstractRegularizer(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]: + raise NotImplementedError() + + @abstractmethod + def get_trainable_parameters(self) -> Any: + raise NotImplementedError() + + +class DiagonalGaussianRegularizer(AbstractRegularizer): + def __init__(self, sample: bool = True): + super().__init__() + self.sample = sample + + def get_trainable_parameters(self) -> Any: + yield from () + + def forward(self, z: torch.Tensor, n_steps=None) -> Tuple[torch.Tensor, dict]: + log = dict() + posterior = DiagonalGaussianDistribution(z) + if self.sample: + z = posterior.sample() + else: + z = posterior.mode() + kl_loss = posterior.kl() + kl_loss = torch.sum(kl_loss) / kl_loss.shape[0] + log["kl_loss"] = kl_loss + return z, log + + +class FSQRegularizer(AbstractRegularizer): + # https://github.com/lucidrains/vector-quantize-pytorch/blob/master/vector_quantize_pytorch/finite_scalar_quantization.py + def __init__( + self, + levels: List[int], + dim: Optional[int] = None, + num_codebooks=1, + keep_num_codebooks_dim: Optional[bool] = None, + scale: Optional[float] = None, + entropy_loss_weight: float = 0.0, + entropy_loss_annealing_steps: int = 0, + entropy_loss_annealing_factor: float = 1.0, + commitment_loss_weight: float = 0.0, + diversity_gamma: float = 1.0, + ): + super().__init__() + _levels = torch.tensor(levels, dtype=int32) + self.register_buffer("_levels", _levels, persistent=False) + + _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32) + self.register_buffer("_basis", _basis, persistent=False) + + self.scale = scale + self.entropy_loss_weight = entropy_loss_weight + self.entropy_loss_annealing_steps = entropy_loss_annealing_steps + self.entropy_loss_annealing_factor = entropy_loss_annealing_factor + self.commitment_loss_weight = commitment_loss_weight + self.diversity_gamma = diversity_gamma + + codebook_dim = len(levels) + self.codebook_dim = codebook_dim + + effective_codebook_dim = codebook_dim * num_codebooks + self.num_codebooks = num_codebooks + self.effective_codebook_dim = effective_codebook_dim + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + self.dim = default(dim, len(_levels) * 
num_codebooks) + + has_projections = self.dim != effective_codebook_dim + self.project_in = nn.Linear(self.dim, effective_codebook_dim) if has_projections else nn.Identity() + self.project_out = nn.Linear(effective_codebook_dim, self.dim) if has_projections else nn.Identity() + self.has_projections = has_projections + + self.codebook_size = self._levels.prod().item() + + implicit_codebook = self.indices_to_codes(torch.arange(self.codebook_size), project_out=False) + self.register_buffer("implicit_codebook", implicit_codebook, persistent=False) + self.register_buffer("zero", torch.tensor(0.0), persistent=False) + + self.global_codebook_usage = torch.zeros([2**self.codebook_dim, self.num_codebooks], dtype=torch.long) + + def get_trainable_parameters(self) -> Any: + return self.parameters() + + def bound(self, z: Tensor, eps: float = 1e-3) -> Tensor: + """Bound `z`, an array of shape (..., d).""" + half_l = (self._levels - 1) * (1 + eps) / 2 + offset = torch.where(self._levels % 2 == 0, 0.5, 0.0) + shift = (offset / half_l).atanh() + return (z + shift).tanh() * half_l - offset + + def quantize(self, z: Tensor) -> Tensor: + """Quantizes z, returns quantized zhat, same shape as z.""" + quantized = round_ste(self.bound(z)) + half_width = self._levels // 2 + return quantized / half_width + + def _scale_and_shift(self, zhat_normalized: Tensor) -> Tensor: + half_width = self._levels // 2 + return (zhat_normalized * half_width) + half_width + + def _scale_and_shift_inverse(self, zhat: Tensor) -> Tensor: + half_width = self._levels // 2 + return (zhat - half_width) / half_width + + def codes_to_indices(self, zhat: Tensor) -> Tensor: + """Converts a `code` to an index in the codebook.""" + assert zhat.shape[-1] == self.codebook_dim + zhat = self._scale_and_shift(zhat) + return (zhat * self._basis).sum(dim=-1).to(int32) + + def indices_to_codes(self, indices: Tensor, project_out=True) -> Tensor: + """Inverse of `codes_to_indices`.""" + + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + + indices = rearrange(indices, "... -> ... 1") + codes_non_centered = (indices // self._basis) % self._levels + codes = self._scale_and_shift_inverse(codes_non_centered) + + if self.keep_num_codebooks_dim: + codes = rearrange(codes, "... c d -> ... (c d)") + + if project_out: + codes = self.project_out(codes) + + if is_img_or_video: + codes = rearrange(codes, "b ... d -> b d ...") + + return codes + + def calculate_entropy_loss_weight(self, n_steps): + if n_steps >= self.entropy_loss_annealing_steps: + return self.entropy_loss_weight + start = self.entropy_loss_annealing_factor * self.entropy_loss_weight + return start - (n_steps / self.entropy_loss_annealing_steps) * (start - self.entropy_loss_weight) + + @autocast(enabled=False) + def forward(self, z: Tensor, inv_temperature: float = 100.0, n_steps: int = 0) -> Tensor: + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension + c - number of codebook dim + """ + is_img_or_video = z.ndim >= 4 + if is_img_or_video: + z = rearrange(z, "b d ... -> b ... 
d") + z, ps = pack_one(z, "b * d") + + assert z.shape[-1] == self.dim, f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}" + + z = self.project_in(z) + z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks) + + with torch.autocast("cuda", enabled=False): + orig_dtype = z.dtype + z = z.float() + original_input = z + codes = self.quantize(z) + indices = self.codes_to_indices(codes) + + if self.entropy_loss_weight > 0 or self.commitment_loss_weight > 0: + # the same as euclidean distance up to a constant + distance = -2 * torch.einsum("... i d, j d -> ... i j", original_input, self.implicit_codebook) + prob = (-distance * inv_temperature).softmax(dim=-1) + per_sample_probs = rearrange(prob, "b n ... -> (b n) ...") + per_sample_entropy = entropy(per_sample_probs).mean() + # distribution over all available tokens in the batch + avg_prob = reduce(per_sample_probs, "... c d -> c d", "mean") + avg_prob = maybe_distributed_mean(avg_prob) + codebook_entropy = entropy(avg_prob).mean() + entropy_aux_loss = per_sample_entropy - self.diversity_gamma * codebook_entropy + # commit loss + commit_loss = F.mse_loss(original_input, codes.detach(), reduction="none") + commit_loss = commit_loss.mean() + else: + entropy_aux_loss = per_sample_entropy = codebook_entropy = commit_loss = self.zero + + codes = codes.type(orig_dtype) + + codes = rearrange(codes, "b n c d -> b n (c d)") + out = self.project_out(codes) + + # reconstitute image or video dimensions + if is_img_or_video: + out = unpack_one(out, ps, "b * d") + out = rearrange(out, "b ... d -> b d ...") + + indices = unpack_one(indices, ps, "b * c") + + if not self.keep_num_codebooks_dim: + indices = rearrange(indices, "... 1 -> ...") + + aux_loss = ( + entropy_aux_loss * self.calculate_entropy_loss_weight(n_steps) + commit_loss * self.commitment_loss_weight + ) + + return out, dict(indices=indices, aux_loss=aux_loss) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/util.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/util.py new file mode 100644 index 0000000000000000000000000000000000000000..9570016221e47b50d536d30352b5718a29bb0009 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtok/modules/util.py @@ -0,0 +1,324 @@ +import importlib +import random +import os +import einops +import numpy as np +from inspect import isfunction +from rich import print +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from lightning.pytorch.utilities.rank_zero import rank_zero_only + + +def get_valid_dirs(dir1: str, dir2: str, dir3: Union[None, str] = None) -> Union[None, str]: + if (dir1 is not None) and os.path.isdir(dir1): + return dir1 + elif (dir2 is not None) and os.path.isdir(dir2): + return dir2 + elif (dir3 is not None) and os.path.isdir(dir3): + return dir3 + else: + return None + + +def get_valid_paths(path1: str, path2: str, path3: Union[None, str] = None) -> Union[None, str]: + if (path1 is not None) and os.path.isfile(path1): + return path1 + elif (path2 is not None) and os.path.isfile(path2): + return path2 + elif (path3 is not None) and os.path.isfile(path3): + return path3 + else: + return None + + +@rank_zero_only +def print0(*args, **kwargs): + print(*args, **kwargs) + + +def seed_anything(seed: int): + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def isheatmap(x): + if not isinstance(x, 
torch.Tensor): + return False + + return x.ndim == 2 + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def instantiate_from_config(config): + if not "target" in config: + if config == "__is_first_stage__": + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + return get_obj_from_str(config["target"])(**config.get("params", dict())) + + +def get_obj_from_str(string, reload=False, invalidate_cache=True): + module, cls = string.rsplit(".", 1) + if invalidate_cache: + importlib.invalidate_caches() + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def checkpoint(func, inputs, params, flag): + # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(torch.autograd.Function): + # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + ctx.gpu_autocast_kwargs = { + "enabled": torch.is_autocast_enabled(), + "dtype": torch.get_autocast_gpu_dtype(), + "cache_enabled": torch.is_autocast_cache_enabled(), + } + with torch.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] + # Ensure all tensors have requires_grad set to True + ctx.input_params = [p.requires_grad_(True) for p in ctx.input_params] + with torch.enable_grad(), torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. 
+ shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = torch.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads + + +def compute_psnr(x, y): + if x.dim() == 5: + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + assert y.dim() == 5 + y = einops.rearrange(y, "b c t h w -> (b t) c h w") + EPS = 1e-8 + mse = torch.mean((x - y) ** 2, dim=[1, 2, 3]) + psnr = -10 * torch.log10(mse + EPS) + return psnr.mean(dim=0) + + +def compute_ssim(x, y): + if x.dim() == 5: + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + assert y.dim() == 5 + y = einops.rearrange(y, "b c t h w -> (b t) c h w") + kernel_size = 11 + kernel_sigma = 1.5 + k1 = 0.01 + k2 = 0.03 + + f = max(1, round(min(x.size()[-2:]) / 256)) + if f > 1: + x = F.avg_pool2d(x, kernel_size=f) + y = F.avg_pool2d(y, kernel_size=f) + + kernel = gaussian_filter(kernel_size, kernel_sigma, device=x.device, dtype=x.dtype).repeat(x.size(1), 1, 1, 1) + + _compute_ssim_per_channel = _ssim_per_channel_complex if x.dim() == 5 else _ssim_per_channel + ssim_map, cs_map = _compute_ssim_per_channel(x=x, y=y, kernel=kernel, data_range=1, k1=k1, k2=k2) + ssim_val = ssim_map.mean(1) + + return ssim_val.mean(dim=0) + + +def _ssim_per_channel( + x: torch.Tensor, + y: torch.Tensor, + kernel: torch.Tensor, + data_range: Union[float, int] = 1.0, + k1: float = 0.01, + k2: float = 0.03, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. + + Args: + x: An input tensor. Shape :math:`(N, C, H, W)`. + y: A target tensor. Shape :math:`(N, C, H, W)`. + kernel: 2D Gaussian kernel. + data_range: Maximum value range of images (usually 1.0 or 255). + k1: Algorithm parameter, K1 (small constant, see [1]). + k2: Algorithm parameter, K2 (small constant, see [1]). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Full Value of Structural Similarity (SSIM) index. + """ + if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): + raise ValueError( + f"Kernel size can't be greater than actual input size. " + f"Input size: {x.size()}. Kernel size: {kernel.size()}" + ) + + c1 = k1**2 + c2 = k2**2 + n_channels = x.size(1) + mu_x = F.conv2d(x, weight=kernel, stride=1, padding=0, groups=n_channels) + mu_y = F.conv2d(y, weight=kernel, stride=1, padding=0, groups=n_channels) + + mu_xx = mu_x**2 + mu_yy = mu_y**2 + mu_xy = mu_x * mu_y + + sigma_xx = F.conv2d(x**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xx + sigma_yy = F.conv2d(y**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_yy + sigma_xy = F.conv2d(x * y, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xy + + # Contrast sensitivity (CS) with alpha = beta = gamma = 1. + cs = (2.0 * sigma_xy + c2) / (sigma_xx + sigma_yy + c2) + + # Structural similarity (SSIM) + ss = (2.0 * mu_xy + c1) / (mu_xx + mu_yy + c1) * cs + + ssim_val = ss.mean(dim=(-1, -2)) + cs = cs.mean(dim=(-1, -2)) + return ssim_val, cs + + +def _ssim_per_channel_complex( + x: torch.Tensor, + y: torch.Tensor, + kernel: torch.Tensor, + data_range: Union[float, int] = 1.0, + k1: float = 0.01, + k2: float = 0.03, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. 
+ + Args: + x: An input tensor. Shape :math:`(N, C, H, W, 2)`. + y: A target tensor. Shape :math:`(N, C, H, W, 2)`. + kernel: 2-D gauss kernel. + data_range: Maximum value range of images (usually 1.0 or 255). + k1: Algorithm parameter, K1 (small constant, see [1]). + k2: Algorithm parameter, K2 (small constant, see [1]). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Full Value of Complex Structural Similarity (SSIM) index. + """ + n_channels = x.size(1) + if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): + raise ValueError( + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " + f"Kernel size: {kernel.size()}" + ) + + c1 = k1**2 + c2 = k2**2 + + x_real = x[..., 0] + x_imag = x[..., 1] + y_real = y[..., 0] + y_imag = y[..., 1] + + mu1_real = F.conv2d(x_real, weight=kernel, stride=1, padding=0, groups=n_channels) + mu1_imag = F.conv2d(x_imag, weight=kernel, stride=1, padding=0, groups=n_channels) + mu2_real = F.conv2d(y_real, weight=kernel, stride=1, padding=0, groups=n_channels) + mu2_imag = F.conv2d(y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) + + mu1_sq = mu1_real.pow(2) + mu1_imag.pow(2) + mu2_sq = mu2_real.pow(2) + mu2_imag.pow(2) + mu1_mu2_real = mu1_real * mu2_real - mu1_imag * mu2_imag + mu1_mu2_imag = mu1_real * mu2_imag + mu1_imag * mu2_real + + compensation = 1.0 + + x_sq = x_real.pow(2) + x_imag.pow(2) + y_sq = y_real.pow(2) + y_imag.pow(2) + x_y_real = x_real * y_real - x_imag * y_imag + x_y_imag = x_real * y_imag + x_imag * y_real + + sigma1_sq = F.conv2d(x_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_sq + sigma2_sq = F.conv2d(y_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu2_sq + sigma12_real = F.conv2d(x_y_real, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_real + sigma12_imag = F.conv2d(x_y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_imag + sigma12 = torch.stack((sigma12_imag, sigma12_real), dim=-1) + mu1_mu2 = torch.stack((mu1_mu2_real, mu1_mu2_imag), dim=-1) + # Set alpha = beta = gamma = 1. + cs_map = (sigma12 * 2 + c2 * compensation) / (sigma1_sq.unsqueeze(-1) + sigma2_sq.unsqueeze(-1) + c2 * compensation) + ssim_map = (mu1_mu2 * 2 + c1 * compensation) / (mu1_sq.unsqueeze(-1) + mu2_sq.unsqueeze(-1) + c1 * compensation) + ssim_map = ssim_map * cs_map + + ssim_val = ssim_map.mean(dim=(-2, -3)) + cs = cs_map.mean(dim=(-2, -3)) + + return ssim_val, cs + + +def gaussian_filter( + kernel_size: int, sigma: float, device: Optional[str] = None, dtype: Optional[type] = None +) -> torch.Tensor: + r"""Returns 2D Gaussian kernel N(0,`sigma`^2) + Args: + kernel_size: Size of the kernel + sigma: Std of the distribution + device: target device for kernel generation + dtype: target data type for kernel generation + Returns: + gaussian_kernel: Tensor with shape (1, kernel_size, kernel_size) + """ + coords = torch.arange(kernel_size, dtype=dtype, device=device) + coords -= (kernel_size - 1) / 2.0 + + g = coords**2 + g = (-(g.unsqueeze(0) + g.unsqueeze(1)) / (2 * sigma**2)).exp() + + g /= g.sum() + return g.unsqueeze(0) diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/README.md b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e475c6eade763f9cef3b67a905dd738e1fb43fef --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/README.md @@ -0,0 +1,211 @@ + + +

+VidTwin: Video VAE with Decoupled Structure and Dynamics (CVPR 2025) +

+

+ Yuchi Wang   + Junliang Guo   + Xinyi Xie   + Tianyu He   + Xu Sun   + Jiang Bian +

+ +
+ +
+ +[![arXiv](https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2412.17726)   [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow)](https://huggingface.co/microsoft/vidtwin)   [![Static Badge](https://img.shields.io/badge/Demo-Project_Page-yellow)](https://vidtwin.github.io/) + +

🔥 Check our Demo Page for an enhanced visual experience.

+ + +
+
+
+ +We propose a novel and compact video autoencoder, VidTwin, that decouples video into two distinct latent spaces: **Structure latent vectors**, which capture overall content and global movement, and **Dynamics latent vectors**, which represent fine-grained details and rapid movements. + +Extensive experiments show that VidTwin achieves a high compression rate of 0.20% with high reconstruction quality (PSNR of 28.14 on the MCL-JCV dataset), and performs efficiently and effectively in downstream generative tasks. Moreover, our model demonstrates explainability and scalability, paving the way for future research in video latent representation and generation. +
+
+ +
+ +## Setup + +1. Our code is based on **VidTok**, so you will need to install the [required packages for VidTok](https://github.com/microsoft/VidTok?tab=readme-ov-file#setup) first. To do so, navigate to the VidTok folder and create the environment using the `environment.yaml` file: + +```bash +cd VidTok +# Prepare conda environment +conda env create -f environment.yaml +# Activate the environment +conda activate vidtok +``` + +2. After setting up VidTok, install the additional packages required for the VidTwin model: +```bash +pip install transformers +pip install timm +pip install flash-attn --no-build-isolation +``` + + +## Training + +### Data Preparation + +We follow the same approach as **VidTok** to prepare the data. You can also find the Dataloader class in: `vidtok/data/vidtok.py`. This Dataloader is a general-purpose class for handling video data. You may customize it to suit your own dataset and specific use cases. + +1. Put all training videos under `DATA_DIR`: +``` +└── DATA_DIR + ├── subset1 + │ ├── videoname11.mp4 + │ └── videoname12.mp4 + ├── subset2 + │ ├── videoname21.mp4 + │ ├── videoname22.mp4 + │ └── subsubset1 + │ ├── videoname211.mp4 + │ └── videoname212.mp4 + └── ... +``` +2. Prepare a `.csv` meta file to record the relative paths of these videos with respect to `DATA_DIR`, like: +``` +videos +subset1/videoname11.mp4 +subset2/videoname21.mp4 +subset2/subsubset1/videoname211.mp4 +``` + +> Validation data is also prepared following the above steps. + +### Launch Training + +1. Specify the Configuration File + +Our code follows a **modular design**, allowing you to easily customize the model structure and training settings by modifying a configuration file. For the **VidTwin** model, we provide the following configuration file: `configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml`. + +- In the **Model** section of the configuration file, you can specify the model's structure and key hyperparameters. For instance, you can adjust the following settings: + +```yaml +model: + params: + expect_ch: 8 # the dimension of the Structure Latent, d_S + cont_num_blocks: 1 # downsample blocks of the Structure Latent, 1 -> h_S = 7, 2 -> h_S = 4, 3 -> h_S = 2 + downsample_motion: True + motion_num_blocks: 1 # downsample blocks of the Dynamics Latent, 1 -> h_D = 7, 2 -> h_D = 4, 3 -> h_D = 2 + d_dim: 8 # the dimension of the Dynamics Latent, d_D +``` + +- If you'd like to **fine-tune** the model from a pre-trained checkpoint instead of training from scratch, you can specify the `ckpt_path` parameter in the configuration file. + +```yaml +model: + params: + ckpt_path: PATH_TO_CHECKPOINT # train from existing checkpoint +``` + +- In the **Data** section of the configuration file, you can specify paths and other important data-related hyperparameters.
+ +```yaml +train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 # 224 for our VidTwin model + input_width: INPUT_WIDTH_1 # 224 for our VidTwin model + sample_num_frames: NUM_FRAMES_1 # set to 16 for our VidTwin model + sample_fps: SAMPLE_FPS_1 # sample fps for training data, 8 for VidTwin model +validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 # 224 for our VidTwin model + input_width: INPUT_WIDTH_2 # 224 for our VidTwin model + sample_num_frames: NUM_FRAMES_2 # set to 16 for our VidTwin model + sample_fps: SAMPLE_FPS_2 # sample fps for validation data + start_index: 0 # fixed value to ensure the same sampled data +``` + +2. Run the following command to start training: +```bash +python main.py -b CONFIG --logdir LOGDIR + +# You can also use `torchrun` to start the training code. +``` +Training logs and checkpoints are saved in `LOGDIR`. + +It is recommended to use [Weights & Biases](https://wandb.ai/site) as the data visualization tool ([TensorBoard](https://www.tensorflow.org/tensorboard) by default). Use `wandb login` to log in first, and then run: +``` +python main.py -b CONFIG --logdir LOGDIR --wandb --wandb_entity ENTITY --wandb_project PROJECT +``` + +## Inference + + +### Easy Usage +We provide the following example for quick usage of our models. After downloading the checkpoint from our [Huggingface page](https://huggingface.co/microsoft/vidtwin), just provide the path of the configuration file `cfg_path` and checkpoint file `ckpt_path` to the script. +```python +import torch +from scripts.inference_evaluate import load_model_from_config + +cfg_path = "configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml" +ckpt_path = "checkpoints/vidtwin_structure_7_7_8_dynamics_7_8.ckpt" + +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +# load pre-trained model +model = load_model_from_config(cfg_path, ckpt_path) +model.to(device).eval() +# random input +num_frames = 16 +x_input = (torch.rand(1, 3, num_frames, 224, 224) * 2 - 1).to(device) # [B, C, T, H, W], range -1~1 +# model forward +_, x_recon, *_ = model(x_input) +assert x_input.shape == x_recon.shape +``` + +### Reconstruct an Input Video +```bash +python vidtwin/scripts/inference_reconstruct.py --config CONFIG --ckpt CKPT --input_video_path VIDEO_PATH --num_frames_per_batch NUM_FRAMES_PER_BATCH --input_height 224 --input_width 224 --sample_fps 25 --output_video_dir OUTPUT_DIR +``` +- Set `VIDEO_PATH` to the path of your test video. We provide an example video in `assets/example.mp4`. +- Set `NUM_FRAMES_PER_BATCH` to `16`. +- The reconstructed video is saved in `OUTPUT_DIR`. + +### Performance Evaluation +We also provide a script `scripts/inference_evaluate.py` to evaluate the video reconstruction performance in PSNR, SSIM and LPIPS. + +1. Put all of your test videos under `DATA_DIR`. +2.
Run the following command, and all `.mp4` videos under `DATA_DIR` will be tested: +```bash +python vidtwin/scripts/inference_evaluate.py --config CONFIG --ckpt CKPT --data_dir DATA_DIR --num_frames_per_batch NUM_FRAMES_PER_BATCH --input_height 224 --input_width 224 --sample_fps 25 +``` +(Optional) If you only want to test certain videos under `DATA_DIR`, you need to prepare a `.csv` meta file +to indicate the video files to be tested (refer to [Data Preparation](#data-preparation)). And add `--meta_path META_PATH` to the above command to specify the path to the `.csv` meta file. + + + +### Cross-reenactment Reconstruction + +For VidTwin model, we conduct a cross-reenactment experiment in which we combine the *Structure Latent* from one video, $A$, with the *Dynamics Latent* from another video, $B$, to observe the generated output from the decoder, i.e., generating $\mathcal{D}(u^A_{\boldsymbol{S}}, u^B_{\boldsymbol{D}})$. + +To facilitate this experiment, we provide the script `vidtwin/scripts/inference_vidtwin_cross_reconstruct.py`. This script follows a similar usage method to `vidtwin/scripts/inference_reconstruct.py` with the addition of two new arguments: `--input_video_path_structure` and `--input_video_path_dynamics`, which allow you to specify the videos for structure and dynamics information, respectively. + +## BibTeX +If you find our project helpful to your research, please consider starring this repository🌟 and citing our paper. +```bibtex +@article{wang2024vidtwin, + title={VidTwin: Video VAE with Decoupled Structure and Dynamics}, + author={Wang, Yuchi and Guo, Junliang and Xie, Xinyi and He, Tianyu and Sun, Xu and Bian, Jiang}, + year={2024}, + journal={arXiv preprint arXiv:2412.17726}, +} +``` diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/models/vidtwin_ae.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/models/vidtwin_ae.py new file mode 100644 index 0000000000000000000000000000000000000000..22daf6cae27b2731021490adf8a4b8cb41dc5a4b --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/models/vidtwin_ae.py @@ -0,0 +1,1604 @@ +import os +import re +import math +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union + +import lightning.pytorch as pl +import torch +import einops +from omegaconf import ListConfig +from packaging import version +from safetensors.torch import load_file as load_safetensors + +from torch.optim.lr_scheduler import _LRScheduler, LambdaLR, StepLR +from vidtok.modules.util import default, instantiate_from_config, print0, get_valid_paths +from vidtok.modules.util import compute_psnr, compute_ssim +from vidtok.models.autoencoder import AbstractAutoencoder +import numpy as np +from torch import nn +from einops import rearrange, repeat +import transformers + + +class VidAutoEncoderQformerBase(AbstractAutoencoder): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def init_from_ckpt( + self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple() + ) -> None: + if path.endswith("ckpt"): + # sd = torch.load(path, map_location="cpu")["state_dict"] + ckpt = torch.load(path, map_location="cpu") + if "state_dict" in ckpt: + sd = ckpt["state_dict"] + else: + sd = ckpt + elif path.endswith("safetensors"): + sd = load_safetensors(path) + else: + raise NotImplementedError + + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold 
magenta] Deleting key {k} from state_dict.") + del sd[k] + + for k, tensor in sd.items(): + sd[k] = tensor.to(torch.float64) + + missing, unexpected = self.load_state_dict(sd, strict=False) + print0( + f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if len(missing) > 0: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Unexpected Keys: {unexpected}") + + def get_input(self, batch: Dict) -> torch.Tensor: + # assuming unified data format, dataloader returns a dict. + # image tensors should be scaled to -1 ... 1 and in channels-first format (e.g., bchw instead if bhwc) + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(self.encoder.parameters()) + + list(self.decoder.parameters()) + + list(self.get_disentangle_params()) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) # e.g., discriminator + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + # See https://github.com/Lightning-AI/pytorch-lightning/issues/17801 and https://lightning.ai/docs/pytorch/stable/common/optimization.html for the reason of this change + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + z, xrec, regularization_log, *_ = self(x) + opt_g, opt_d = self.optimizers() + sch1, sch2 = self.lr_schedulers() + + + # autoencode loss + self.toggle_optimizer(opt_g) + # adversarial loss is binary cross-entropy + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + opt_g.step() + sch1.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + # adversarial loss is binary cross-entropy + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + opt_d.step() + + sch2.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + z, xrec, regularization_log, *_ = self(x) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + self.log(f"val{postfix}/rec_loss", 
log_dict_ae[f"val{postfix}/rec_loss"]) + log_dict_ae.update(log_dict_disc) + self.log_dict(log_dict_ae) + + # evaluate the psnr and ssim + x = x.clamp(-1, 1) + xrec = xrec.clamp(-1, 1) + x = (x + 1) / 2 + xrec = (xrec + 1) / 2 + psnr = compute_psnr(xrec, x) + ssim = compute_ssim(xrec, x) + + self.log(f"val{postfix}/psnr", psnr, prog_bar=True, logger=True, on_step=True, on_epoch=True) + self.log(f"val{postfix}/ssim", ssim, prog_bar=True, logger=True, on_step=True, on_epoch=True) + return log_dict_ae + + def configure_optimizers(self): + if self.trainable_ae_params is None: + ae_params = self.get_autoencoder_params() + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable autoencoder parameters: {len(ae_params):,}") + else: + ae_params, num_ae_params = self.get_param_groups( + self.trainable_ae_params, self.ae_optimizer_args + ) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable autoencoder parameters: {num_ae_params:,}") + if self.trainable_disc_params is None: + disc_params = self.get_discriminator_params() + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable discriminator parameters: {len(disc_params):,}") + else: + disc_params, num_disc_params = self.get_param_groups( + self.trainable_disc_params, self.disc_optimizer_args + ) + print0( + f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable discriminator parameters: {num_disc_params:,}" + ) + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + + if len(disc_params) > 0: + opt_disc = self.instantiate_optimizer_from_config( + disc_params, self.learning_rate, self.optimizer_config + ) + + lr_freq1 = 1 + lr_freq2 = 1 + if not self.use_scheduler_g: + total_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + scheduler1 = ConstantWarmupScheduler(opt_ae, warmup_steps=500, total_steps=total_steps) + else: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use generator lr scheduler: {self.lr_scheduler_config_g.target}") + lr_freq1 = self.lr_scheduler_config_g.params.frequency if hasattr(self.lr_scheduler_config_g.params, 'frequency') else 1 + max_decay_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler max_decay_steps: {max_decay_steps}") + if 'inverse_sqrt' in self.lr_scheduler_config_g.target: + scheduler1 = transformers.get_inverse_sqrt_schedule(optimizer=opt_ae, num_warmup_steps=self.lr_scheduler_config_g.params.num_warmup_steps) + elif 'LambdaWarmUpCosineScheduler' in self.lr_scheduler_config_g.target: + scheduler1 = LambdaWarmUpCosineScheduler(optimizer=opt_ae, total_steps=max_decay_steps, **self.lr_scheduler_config_g.params) + elif 'LinearWarmupScheduler' in self.lr_scheduler_config_g.target: + scheduler1 = LinearWarmupScheduler(opt_ae, total_steps=max_decay_steps, **self.lr_scheduler_config_g.params) + else: + scheduler1 = instantiate_lrscheduler_from_config(opt_ae, self.lr_scheduler_config_g, total_steps=max_decay_steps) + + if not self.use_scheduler_d: + total_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + scheduler2 = ConstantWarmupScheduler(opt_disc, 
warmup_steps=500, total_steps=total_steps) + else: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler: {self.lr_scheduler_config_d.target}") + lr_freq2 = self.lr_scheduler_config_d.params.frequency if hasattr(self.lr_scheduler_config_d.params, 'frequency') else 1 + max_decay_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler max_decay_steps: {max_decay_steps}") + if 'inverse_sqrt' in self.lr_scheduler_config_d.target: + scheduler2 = transformers.get_inverse_sqrt_schedule(optimizer=opt_disc, num_warmup_steps=self.lr_scheduler_config_d.params.num_warmup_steps) + elif 'LambdaWarmUpCosineScheduler' in self.lr_scheduler_config_d.target: + scheduler2 = LambdaWarmUpCosineScheduler(optimizer=opt_disc, total_steps=max_decay_steps, **self.lr_scheduler_config_d.params) + elif 'LinearWarmupScheduler' in self.lr_scheduler_config_d.target: + scheduler2 = LinearWarmupScheduler(opt_disc, total_steps=max_decay_steps, **self.lr_scheduler_config_d.params) + else: + scheduler2 = instantiate_lrscheduler_from_config(opt_disc, self.lr_scheduler_config_d, total_steps=max_decay_steps) + + + lr_scheduler_config1 = { + "optimizer": opt_ae, + "lr_scheduler": { + "scheduler": scheduler1, + "name": "lr_generator", + "interval": "step", + "frequency": lr_freq1, + } + } + lr_scheduler_config2 = { + "optimizer": opt_disc, + "lr_scheduler": { + "scheduler": scheduler2, + "name": "lr_discriminator", + "interval": "step", + "frequency": lr_freq2, + } + } + return (lr_scheduler_config1, lr_scheduler_config2) + + @torch.no_grad() + def log_images(self, batch: Dict, **kwargs) -> Dict: # called at ImageLoggerCallback.log_img() + log = dict() + x = self.get_input(batch) + _, xrec, *_ = self(x) + log["inputs"] = x + log["reconstructions"] = xrec + return log + + +class VidAutoEncoderQformer(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + height_qformer_config: Dict, + width_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.hight_qformer = instantiate_from_config(height_qformer_config) + self.width_qformer = instantiate_from_config(width_qformer_config) + + + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if 
self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + # (bhw, f, c) -> (bhw, f',c') + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.height_emb = nn.Sequential( + nn.Linear(height_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(height_qformer_config.params.num_query_tokens, self.patch_nums[1], 1), + nn.ReLU(), + ) + + self.width_emb = nn.Sequential( + nn.Linear(width_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(width_qformer_config.params.num_query_tokens, self.patch_nums[2], 1), + nn.ReLU(), + ) + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.hight_qformer.parameters()) + + list(self.width_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.height_emb.parameters()) + + list(self.width_emb.parameters()) + ) + + return params + + + def decode(self, z, z_content, z_motion_x, z_motion_y) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h', w', c_q) + z_motion_x: shape (b, f, h_q, w', c_q) + z_motion_y: shape (b, f, h', w_q, c_q) + ''' + z_content = rearrange(z_content, 'B F H W C -> (B H W) F C') + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + z_motion_x = rearrange(z_motion_x, 'B F H W C -> (B F W) H C') + vx = rearrange(self.height_emb(z_motion_x), '(B F W) H C -> B C F H W', F=z.size(2), W=z.size(4)) + z_motion_y = rearrange(z_motion_y, 'B F H W C -> (B F H) W C') + vy = rearrange(self.width_emb(z_motion_y), '(B F H) W C -> B C F H W', F=z.size(2), H=z.size(3)) + c_plus_m = vt + vx + vy # shape (b, c', f, h', w') + x = self.decoder(c_plus_m) + return x + + def 
encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + z_content = self.temporal_qformer(rearrange(z, 'B C F H W -> (B H W) F C')) + z_content = rearrange(z_content, '(B H W) F C -> B F H W C', H=z.size(3), W=z.size(4)) # compressed in the temporal dimension + z_motion_x = self.hight_qformer(rearrange(z, 'B C F H W -> (B F W) H C')) + z_motion_x = rearrange(z_motion_x, '(B F W) H C -> B F H W C', F=z.size(2), W=z.size(4)) # compressed in the height dimension + z_motion_y = self.width_qformer(rearrange(z, 'B C F H W -> (B F H) W C')) + z_motion_y = rearrange(z_motion_y, '(B F H) W C -> B F H W C', F=z.size(2), H=z.size(3)) # compressed in the width dimension + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, None + return z, z_content, z_motion_x, z_motion_y + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class VidAutoEncoderQformerCompact(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + space_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + retain_num_frames: bool = True, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + repeat_for_decoder: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.space_qformer = instantiate_from_config(space_qformer_config) + + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + self.repeat_for_decoder = repeat_for_decoder + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = 
trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.down_channel_temp = nn.Linear(self.hidden_dim, self.temporal_down_dim) + self.up_channel_temp = nn.Linear(self.temporal_down_dim, self.hidden_dim) + self.pre_temporal_qformer = nn.Sequential( + nn.Linear(self.temporal_down_dim * self.patch_nums[1] * self.patch_nums[2], self.hidden_dim), + nn.ReLU(), + ) + self.retain_num_frames = retain_num_frames + if not self.retain_num_frames: + self.pre_spatial_qformer = nn.Sequential( + nn.Linear(self.hidden_dim * self.patch_nums[0], 2 * self.hidden_dim), + nn.ReLU(), + nn.Linear(2 * self.hidden_dim, self.hidden_dim), + nn.ReLU(), + ) + if self.repeat_for_decoder: + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + # (bhw, f, c) -> (bhw, f',c') + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.temporal_down_dim * self.patch_nums[1] * self.patch_nums[2]), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + if retain_num_frames: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim * self.patch_nums[0]), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.space_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.spatial_emb.parameters()) + + list(self.pre_temporal_qformer.parameters()) + + list(self.down_channel_temp.parameters()) + ) + if not self.retain_num_frames: + params += list(self.pre_spatial_qformer.parameters()) + if not self.repeat_for_decoder: + params += list(self.up_channel_temp.parameters()) + return params + + 
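The encode/decode methods in these variants lean on einops rearrange to expose one axis at a time to the Q-Former: 'B C F H W -> (B H W) F C' treats each spatial location as a sequence of F frame tokens (temporal compression), while 'B C F H W -> (B F) (H W) C' treats each frame as a sequence of H'*W' spatial tokens (the pattern used when retain_num_frames is set). A minimal, self-contained sketch of just that reshaping, with toy sizes and the Q-Former itself omitted since its config lives elsewhere in the repo:

# Illustrative sketch only: shows the rearrange bookkeeping, not the actual Q-Former.
import torch
from einops import rearrange

B, C, F, H, W = 2, 8, 4, 3, 3                # toy latent z of shape (b, c', f, h', w')
z = torch.randn(B, C, F, H, W)

# Temporal branch: every spatial location becomes a sequence of F frame tokens.
tokens_t = rearrange(z, 'B C F H W -> (B H W) F C')       # (B*H*W, F, C)
# ... a Q-Former would map the F tokens to f_q query tokens here ...
z_content = rearrange(tokens_t, '(B H W) F C -> B F H W C', H=H, W=W)

# Spatial branch: every frame becomes a sequence of H*W spatial tokens.
tokens_s = rearrange(z, 'B C F H W -> (B F) (H W) C')     # (B*F, H*W, C)
z_motion = rearrange(tokens_s, '(B F) S C -> B F S C', F=F)

print(tokens_t.shape, z_content.shape)   # torch.Size([18, 4, 8]) torch.Size([2, 4, 3, 3, 8])
print(tokens_s.shape, z_motion.shape)    # torch.Size([8, 9, 8])  torch.Size([2, 4, 9, 8])

Folding the batch axis together with the axes that are not being compressed is what lets a single Q-Former run per spatial location (temporal branch) or per frame (spatial branch) without explicit loops.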
def decode(self, z, z_content, z_motion, only_part=None) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + if self.repeat_for_decoder: + z_content = repeat(z_content, 'B F C -> B f F C', f=z.size(2)) + vt = rearrange(self.cont_emb(rearrange(z_content, 'B F A d -> (B F) A d')), '(B f) (H W) C -> B C f H W', H=z.size(3), W=z.size(4), f=z.size(2)) + else: + vt = rearrange(self.cont_emb(z_content), 'B F (C H W) -> B C F H W', H=z.size(3), W=z.size(4)) + vt = self.up_channel_temp(vt.transpose(1, -1)).transpose(1, -1) + if self.retain_num_frames: + vs = rearrange(self.spatial_emb(rearrange(z_motion, 'B F X Y -> (B F) X Y')), '(B F) (H W) C -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + else: + vs = rearrange(self.spatial_emb(z_motion), 'B (H W) (F C) -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + + if self.partial_content_motion == 'content': + c_plus_m = vt + elif self.partial_content_motion == 'motion': + c_plus_m = vs + else: + c_plus_m = vt + vs # shape (b, c', f, h', w') + if only_part == 'content': + c_plus_m = vt + elif only_part == 'motion': + c_plus_m = vs + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = self.pre_temporal_qformer(rearrange(self.down_channel_temp(rearrange(z_shuffled, 'B C F H W -> B F H W C')), 'B F H W C -> B F (H W C)')) + else: + pre_qformer = self.pre_temporal_qformer(rearrange(self.down_channel_temp(rearrange(z, 'B C F H W -> B F H W C')), 'B F H W C -> B F (H W C)')) + z_content = self.temporal_qformer(pre_qformer) # shape (b, f_q, d_q) + layer_norm_content = nn.LayerNorm(z_content.size(-1)).to(z_content.device) + z_content = layer_norm_content(z_content) + + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z, 'B C F H W -> B (H W) (F C)'))) + layer_norm_motion = nn.LayerNorm(z_motion.size(-1)).to(z_motion.device) + z_motion = layer_norm_motion(z_motion) + if return_reg_log: + return z, z_content, z_motion, None + return z, z_content, z_motion + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion + + +class VidAutoEncoderQformerCompactSym(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + space_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + 
optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + retain_num_frames: bool = True, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + expect_ch: int = 4, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.space_qformer = instantiate_from_config(space_qformer_config) + + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.retain_num_frames = retain_num_frames + if not self.retain_num_frames: + self.pre_spatial_qformer = nn.Sequential( + nn.Linear(self.hidden_dim * self.patch_nums[0], 2 * self.hidden_dim), + nn.ReLU(), + nn.Linear(2 * self.hidden_dim, self.hidden_dim), + nn.ReLU(), + ) + + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + if retain_num_frames: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + self.spatial_emb = nn.Sequential( 
+ nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim * self.patch_nums[0]), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + + + downsample_blocks = [] + in_channels = temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, expect_ch, kernel_size=3, stride=1, padding=1) + self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.space_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.spatial_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + ) + if not self.retain_num_frames: + params += list(self.pre_spatial_qformer.parameters()) + + return params + + def decode(self, z, z_content, z_motion) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + if self.retain_num_frames: + vs = rearrange(self.spatial_emb(rearrange(z_motion, 'B F X Y -> (B F) X Y')), '(B F) (H W) C -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + else: + vs = rearrange(self.spatial_emb(z_motion), 'B (H W) (F C) -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + + if self.partial_content_motion == 'content': + c_plus_m = vt + elif self.partial_content_motion == 'motion': + 
c_plus_m = vs + else: + c_plus_m = vt + vs # shape (b, c', f, h', w') + + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z, 'B C F H W -> B (H W) (F C)'))) + if return_reg_log: + # return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion, None + return z, z_content, z_motion + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion + + +class VidAutoEncoderQformerCompactSymDis(VidAutoEncoderQformerCompactSym): + + def __init__( + self, + *args, + shuffle_content_ratio: float = 0.5, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.shuffle_content_ratio = shuffle_content_ratio + + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + # shuffle the content frames + x_shuffled = x.clone() + for i in range(x.size(0)): + randn_num = torch.rand(1) + if randn_num < self.shuffle_content_ratio: + idx = torch.randperm(x.size(2)) + x_shuffled[i] = x[i, :, idx, :, :] + x = torch.cat([x, x_shuffled], dim=0) + z = self.encoder(x) # shape (2b, c', f, h', w') + z_orig, z_shuffled = z.chunk(2, dim=0) + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z_orig, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z_orig, 'B C F H W -> B (H W) (F C)'))) + if return_reg_log: + # return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion, None + return z, z_content, z_motion + 
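Both the `shuffle_content` flag and the `...CompactSymDis` variant above rely on the same trick: the content branch is fed frames in a random order (per sample), so frame ordering cannot be encoded in the content latent and has to be carried by the motion branch, which still sees the original sequence. A minimal, self-contained sketch of that per-sample frame permutation (the helper name and the batched call are illustrative, mirroring the per-sample loop used in the code above):

import torch

def shuffle_frames(z: torch.Tensor, ratio: float = 0.5) -> torch.Tensor:
    """Randomly permute the frame axis of z (B, C, F, H, W), independently per sample.

    Each sample is shuffled with probability `ratio`, analogous to
    shuffle_content / shuffle_content_ratio above.
    """
    B, C, F, H, W = z.shape
    out = z.clone()
    for i in range(B):
        if torch.rand(1).item() < ratio:
            out[i] = z[i, :, torch.randperm(F), :, :]
    return out

# The content branch sees shuffled frames; the motion branch keeps the original order.
z = torch.randn(2, 8, 4, 3, 3)
z_for_content = shuffle_frames(z, ratio=1.0)   # always shuffle in this toy call
z_for_motion = z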
+class VidAutoEncoderQformerCompactSymVid(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + motion_num_blocks: int = 2, + expect_ch: int = 4, + d_dim: int = 16, + # space_qformer_config: Dict, + downsample_motion: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.d_dim = d_dim + + + downsample_blocks = [] + in_channels = 
temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, expect_ch, kernel_size=3, stride=1, padding=1) + self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + self.motion_emb = nn.Sequential( + nn.Linear(self.d_dim, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim), + nn.ReLU() + ) + self.motion_head = nn.Conv2d(self.hidden_dim, self.d_dim, kernel_size=3, stride=1, padding=1) + + self.downsample_motion = downsample_motion + if self.downsample_motion: + motion_downsample_blocks = [] + curr_resol = self.patch_nums[1] + for i in range(motion_num_blocks): + motion_downsample_blocks.append(nn.Conv2d(self.hidden_dim, self.hidden_dim, kernel_size=3, stride=2, padding=1)) + motion_downsample_blocks.append(nn.ReLU()) + curr_resol = (curr_resol + 1) // 2 + self.downsample_motion_module = nn.Sequential(*motion_downsample_blocks) + self.up_motion = nn.Sequential(nn.Linear(curr_resol, self.patch_nums[1]), + nn.ReLU(), + nn.Linear(self.patch_nums[1], self.patch_nums[1]), + nn.ReLU()) + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + list(self.motion_emb.parameters()) + + list(self.motion_head.parameters()) + + ) + if self.downsample_motion: + params += list(self.downsample_motion_module.parameters()) + params += list(self.up_motion.parameters()) + + return params + + def decode(self, z, z_content, z_motion_x, z_motion_y) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = 
rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + vx = rearrange(self.motion_emb(rearrange(z_motion_x, 'B D F W -> B F W D')), 'B F W C -> B C F W') # shape (b, c', f, w') + vy = rearrange(self.motion_emb(rearrange(z_motion_y, 'B D F H -> B F H D')), 'B F H C -> B C F H') # shape (b, c', f, h') + if self.downsample_motion: + vx = self.up_motion(vx) + vy = self.up_motion(vy) + vx = repeat(vx, 'b c f w -> b c f h w', h=z.size(3)) + vy = repeat(vy, 'b c f h -> b c f h w', w=z.size(4)) + + c_plus_m = vt + vx + vy # shape (b, c', f, h', w') + + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + z_motion_x, z_motion_y = self.get_motion_latent(z) + + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, None + return z, z_content, z_motion_x, z_motion_y + + def get_motion_latent(self, z: torch.Tensor) -> torch.Tensor: + f = z.size(2) + if self.downsample_motion: + z = self.downsample_motion_module(rearrange(z, 'B C F H W -> (B F) C H W')) + z = rearrange(z, '(B F) C H W -> B C F H W', F=f) + ux = torch.mean(z, dim=-2) # shape (b, c', f, w') + uy = torch.mean(z, dim=-1) # shape (b, c', f, h') + zx = self.motion_head(ux) # shape (b, d, f, w') + zy = self.motion_head(uy) # shape (b, d, f, h') + return zx, zy + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class VidAutoEncoderQformerCompactSymVidVAE(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + motion_num_blocks: int = 2, + expect_ch: int = 4, + d_dim: int = 16, + downsample_motion: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = 
kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.d_dim = d_dim + + + downsample_blocks = [] + in_channels = temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, 2*expect_ch, kernel_size=3, stride=1, padding=1) + 
self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + self.motion_emb = nn.Sequential( + nn.Linear(self.d_dim, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim), + nn.ReLU() + ) + self.motion_head = nn.Conv2d(self.hidden_dim, 2*self.d_dim, kernel_size=3, stride=1, padding=1) + + self.downsample_motion = downsample_motion + if self.downsample_motion: + motion_downsample_blocks = [] + curr_resol = self.patch_nums[1] + for i in range(motion_num_blocks): + motion_downsample_blocks.append(nn.Conv2d(self.hidden_dim, self.hidden_dim, kernel_size=3, stride=2, padding=1)) + motion_downsample_blocks.append(nn.ReLU()) + curr_resol = (curr_resol + 1) // 2 + self.downsample_motion_module = nn.Sequential(*motion_downsample_blocks) + self.up_motion = nn.Sequential(nn.Linear(curr_resol, self.patch_nums[1]), + nn.ReLU(), + nn.Linear(self.patch_nums[1], self.patch_nums[1]), + nn.ReLU()) + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + list(self.motion_emb.parameters()) + + list(self.motion_head.parameters()) + + ) + if self.downsample_motion: + params += list(self.downsample_motion_module.parameters()) + params += list(self.up_motion.parameters()) + + return params + + + def decode(self, z, z_content, z_motion_x, z_motion_y, only_part=None) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + vx = rearrange(self.motion_emb(rearrange(z_motion_x, 'B D F W -> B F W D')), 'B F W C -> B C F W') # shape (b, c', f, w') + vy = rearrange(self.motion_emb(rearrange(z_motion_y, 'B D F H -> B F H D')), 'B F H C -> B C F H') # shape (b, c', f, h') + if self.downsample_motion: + vx = self.up_motion(vx) + vy = self.up_motion(vy) + vx = repeat(vx, 'b c f w -> b c f h w', h=z.size(3)) + vy = repeat(vy, 'b c f h -> b c f h w', w=z.size(4)) + + if only_part == 'content': + c_plus_m = vt + elif only_part == 'motion': + c_plus_m = vx + vy + else: + c_plus_m = vt + vx + vy + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): 
+ idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B C F H W', F=z_content.size(1)) + z_content, content_reglog = self.regularization(z_content) + z_content = rearrange(z_content, 'B C F H W -> B F H W C') + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + z_motion_x, z_motion_y = self.get_motion_latent(z) + z_motion_x, z_motion_x_log = self.regularization(z_motion_x) + z_motion_y, z_motion_y_log = self.regularization(z_motion_y) + reg_log = {} + reg_log['kl_loss'] = content_reglog['kl_loss'] + z_motion_x_log['kl_loss'] + z_motion_y_log['kl_loss'] + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion_x, z_motion_y + + def get_motion_latent(self, z: torch.Tensor) -> torch.Tensor: + f = z.size(2) + if self.downsample_motion: + z = self.downsample_motion_module(rearrange(z, 'B C F H W -> (B F) C H W')) + z = rearrange(z, '(B F) C H W -> B C F H W', F=f) + ux = torch.mean(z, dim=-2) # shape (b, c', f, w') + uy = torch.mean(z, dim=-1) # shape (b, c', f, h') + zx = self.motion_head(ux) # shape (b, d, f, w') + zy = self.motion_head(uy) # shape (b, d, f, h') + + return zx, zy + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class DepthToSpace(nn.Module): + def __init__(self, block_size): + super().__init__() + self.bs, self.bt = block_size + + def forward(self, x): + B, C, N, H, W = x.size() + x = x.view(B, self.bt, self.bs, self.bs, C // ((self.bs ** 2) * self.bt), N, H, W) # (B, bs, bs, bs, C//bs^2, N, H, W) + x = x.permute(0, 4, 5, 1, 6, 2, 7, 3).contiguous() # (B, C//bs^3, N, bs, H, bs, W, bs) + x = x.view(B, C // ((self.bs ** 2) * self.bt), N * self.bt, H * self.bs, W * self.bs) # (B, C//bs^3, N * bs, H * bs, W * bs) + # remove the first frame + if self.bt > 1: + x = x[:, :, 1:, :, :] + else: + x = x + return x + + +from torch.optim.lr_scheduler import _LRScheduler + + +class LinearWarmupScheduler(_LRScheduler): + def __init__(self, optimizer, warmup_steps, total_steps, target_lr, last_epoch=-1): + self.warmup_steps = warmup_steps + self.target_lr = target_lr + self.total_steps = total_steps + super(LinearWarmupScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + # Linear warm-up + return [base_lr * (self.last_epoch / self.warmup_steps) for base_lr in self.base_lrs] + elif self.last_epoch < self.total_steps: + # Constant learning rate + return [base_lr * (1 - self.last_epoch / self.total_steps) for base_lr in self.base_lrs] + else: + return self.base_lrs + +class ConstantWarmupScheduler(_LRScheduler): + def __init__(self, optimizer, warmup_steps, total_steps, last_epoch=-1): + self.warmup_steps = warmup_steps + self.total_steps = total_steps + # self.base_lrs = 
lr_max + super(ConstantWarmupScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + # Linear warm-up + return [base_lr * (self.last_epoch / self.warmup_steps) for base_lr in self.base_lrs] + elif self.last_epoch < self.total_steps: + # Constant learning rate + return self.base_lrs + +class LambdaWarmUpCosineScheduler(_LRScheduler): + """ + note: use with a base_lr of 1.0 + """ + def __init__(self, optimizer, lr_min, lr_max, lr_start, total_steps, warmup_rate = -1, verbosity_interval=0, last_epoch=-1, warmup_steps=-1): + self.verbosity_interval = verbosity_interval + if warmup_rate >= 0: + self.lr_warm_up_steps = total_steps * warmup_rate + elif warmup_steps >= 0: + self.lr_warm_up_steps = warmup_steps + else: + self.lr_warm_up_steps = 0 + self.lr_start = lr_start + self.lr_min = lr_min + self.lr_max = lr_max + self.lr_max_decay_steps = total_steps + super(LambdaWarmUpCosineScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.verbosity_interval > 0: + if self.last_epoch % self.verbosity_interval == 0: print(f"current step: {self.last_epoch}, recent lr-multiplier: {self.last_lr}") + if self.last_epoch < self.lr_warm_up_steps: + lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * self.last_epoch + self.lr_start + self.last_lr = lr + return [lr] + else: + t = (self.last_epoch - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) + t = min(t, 1.0) + lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( + 1 + np.cos(t * np.pi)) # a + 0.5 * (b - a) * (1 + cos(pi * t)), where t \in [0, 1], so the lr will be in [a, b] + self.last_lr = lr + return [lr] + + + +def instantiate_lrscheduler_from_config(optimizer, config, name='main-LR'): + """ + Instantiate a learning rate scheduler from a config dict. + If use timm, must add the following codes to the LightningModule: + + def lr_scheduler_step(self, scheduler, metric): + if 'timm.scheduler' in self.lr_scheduler_config.target: + scheduler.step(epoch=self.current_epoch) + else: + if metric is None: + scheduler.step() + else: + scheduler.step(metric) + """ + assert 'target' in config, 'Expected key `target` to instantiate.' 
+ if ('torch.optim' in config.target) or ('timm.scheduler' in config.target): + scheduler = get_obj_from_str(config["target"])(optimizer, **config.get("params", dict())) + lr_scheduler = { + 'scheduler': scheduler, + 'name': name + } + else: + scheduler_init = instantiate_from_config(config) + scheduler = LambdaLR(optimizer, lr_lambda=scheduler_init.schedule) + lr_scheduler = { + 'scheduler': LambdaLR(optimizer, lr_lambda=scheduler_init.schedule), + 'name': name, + 'interval': 'step', + 'frequency': 1 + } + return scheduler + + + diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/modules/qformer.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/modules/qformer.py new file mode 100644 index 0000000000000000000000000000000000000000..be08e95b15c2fa1b82a0d761547734834e6bf139 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/modules/qformer.py @@ -0,0 +1,654 @@ +# coding=utf-8 +"""PyTorch BLIP-2 model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from transformers import Blip2QFormerConfig, Blip2PreTrainedModel + + +logger = logging.get_logger(__name__) + +class Blip2QFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def 
transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class Blip2QFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention) + self.output = Blip2QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class Blip2QFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class Blip2QFormerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = 
self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Blip2QFormerModel(Blip2PreTrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. + """ + + def __init__(self, config: Blip2QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + query_embeds: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
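+ query_embeds (`torch.FloatTensor` of shape `(batch_size, num_query_tokens, hidden_size)`): + Learned query token embeddings; they are layer-normalized and used as the hidden states that cross-attend to `encoder_hidden_states`.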
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +from einops import repeat + +class MyQformerInterface(nn.Module): + def __init__(self, num_query_tokens=3, query_hidden_size=64, encoder_hidden_size=768, num_hidden_layers=6,intermediate_size=768, num_attention_heads=8): + super().__init__() + self.config = Blip2QFormerConfig(hidden_size=query_hidden_size, encoder_hidden_size=encoder_hidden_size, num_hidden_layers=num_hidden_layers, intermediate_size=intermediate_size, num_attention_heads=num_attention_heads) + self.qformer = Blip2QFormerModel(self.config) + self.query_embeds = nn.Parameter(torch.randn(num_query_tokens, query_hidden_size)) + + def forward(self, encoder_hidden_states): + query_batch = repeat(self.query_embeds, 'q d -> b q d', 
b=encoder_hidden_states.shape[0]) + output = self.qformer(query_embeds=query_batch, encoder_hidden_states=encoder_hidden_states) + return output.last_hidden_state + + +if __name__ == '__main__': + a_former = MyQformerInterface(10, 768, 768) + print('initialized query embeddings', a_former.query_embeds) + test_encoder_hidden_states = torch.randn(2, 16, 768) * 100 + + for name, param in a_former.named_parameters(): + print(name, param.shape) + optim = torch.optim.Adam(a_former.parameters(), lr=0.01) + for i in range(20): + print('running forward pass', i) + output = a_former(test_encoder_hidden_states) + print('loss', output.sum()) + output.sum().backward() + optim.step() + optim.zero_grad() + + print('query embeddings after 10 forward passes', a_former.query_embeds) + diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/modules/st_transformer.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/modules/st_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ae93682b034377a6fa7c36a55f6e714462bab28b --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/modules/st_transformer.py @@ -0,0 +1,804 @@ +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +from timm.models.layers import DropPath +from timm.models.vision_transformer import Mlp +import torch.nn.functional as F +approx_gelu = lambda: nn.GELU(approximate="tanh") + +from collections.abc import Iterable + +from torch.utils.checkpoint import checkpoint, checkpoint_sequential +from pathlib import Path +from omegaconf import ListConfig +from torch.cuda.amp import autocast + +from einops import rearrange, repeat, reduce, pack, unpack +import pickle + +def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1): + assert isinstance(model, nn.Module) + + def set_attr(module): + module.grad_checkpointing = True + module.fp32_attention = use_fp32_attention + module.grad_checkpointing_step = gc_step + + model.apply(set_attr) + + +def auto_grad_checkpoint(module, *args, **kwargs): + if getattr(module, "grad_checkpointing", False): + if not isinstance(module, Iterable): + return checkpoint(module, *args, **kwargs) + gc_step = module[0].grad_checkpointing_step + return checkpoint_sequential(module, gc_step, *args, **kwargs) + return module(*args, **kwargs) + + +def get_layernorm(hidden_size: torch.Tensor, eps: float, affine: bool, use_kernel: bool): + if use_kernel: + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(hidden_size, elementwise_affine=affine, eps=eps) + except ImportError: + raise RuntimeError("FusedLayerNorm not available. Please install apex.") + else: + return nn.LayerNorm(hidden_size, eps, elementwise_affine=affine) + + +def t2i_modulate(x, shift, scale): + return x * (1 + scale) + shift + + +class T2IFinalLayer(nn.Module): + """ + The final layer of PixArt. 
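+ Applies a learned shift/scale modulation to the final LayerNorm output, then projects it to num_patch * out_channels.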
+ """ + + def __init__(self, hidden_size, num_patch, out_channels): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True) + self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5) + self.out_channels = out_channels + + def forward(self, x): + shift, scale = (self.scale_shift_table[None]).chunk(2, dim=1) + x = t2i_modulate(self.norm_final(x), shift, scale) + x = self.linear(x) + return x + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + enable_flashattn: bool = False, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + self.enable_flashattn = enable_flashattn + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor, causal: bool) -> torch.Tensor: + B, N, C = x.shape + qkv = self.qkv(x) + qkv_shape = (B, N, 3, self.num_heads, self.head_dim) + if self.enable_flashattn: + qkv_permute_shape = (2, 0, 1, 3, 4) + else: + qkv_permute_shape = (2, 0, 3, 1, 4) + qkv = qkv.view(qkv_shape).permute(qkv_permute_shape) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + if self.enable_flashattn: + from flash_attn import flash_attn_func + + x = flash_attn_func( + q, + k, + v, + dropout_p=self.attn_drop.p if self.training else 0.0, + softmax_scale=self.scale, + causal=causal, + ) + else: + # raise NotImplementedError + dtype = q.dtype + q = q * self.scale + attn = q @ k.transpose(-2, -1) # translate attn to float32 + attn = attn.to(torch.float32) + attn = attn.softmax(dim=-1) + attn = attn.to(dtype) # cast back attn to original dtype + attn = self.attn_drop(attn) + x = attn @ v + + x_output_shape = (B, N, C) + if not self.enable_flashattn: + x = x.transpose(1, 2) + x = x.reshape(x_output_shape) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class GroupAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + enable_flashattn: bool = False, + group_size: int = 4, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + self.enable_flashattn = enable_flashattn + self.group_size = group_size + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor, causal: bool) -> torch.Tensor: + B, N, C = x.shape + assert N % self.group_size == 0, "sequence length should be divisible by 
group_size" + G = N // self.group_size + if self.enable_flashattn: + qkv_permute_shape = (2, 0, 1, 3, 4) + else: + qkv_permute_shape = (2, 0, 3, 1, 4) + qkv = self.qkv(x).view(B, N, 3, self.num_heads, self.head_dim).permute(qkv_permute_shape) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + + + if self.enable_flashattn: + # reshape to (B, G, 4, H, D) + q = q.view(B * G, self.group_size, self.num_heads, self.head_dim) + k = k.view(B * G, self.group_size, self.num_heads, self.head_dim) + v = v.view(B * G, self.group_size, self.num_heads, self.head_dim) + from flash_attn import flash_attn_func + + # modify flash_attn_func to support the new shape + x = flash_attn_func( + q, + k, + v, + dropout_p=self.attn_drop.p if self.training else 0.0, + softmax_scale=self.scale, + causal=causal, + ).reshape(B, N, C) + else: + q = rearrange(q, "B H S D -> (B G) H N D", G=G) + k = rearrange(k, "B H S D -> (B G) H N D", G=G) + v = rearrange(v, "B H S D -> (B G) H N D", G=G) + q = q * self.scale + attn = (q @ k.transpose(-2, -1)).softmax(dim=-1) + attn = self.attn_drop(attn) + x = (attn @ v) + x = rearrange(x, "(B G) H N D -> B S (H D)", G=G, S=N) + + x = self.proj(x) + x = self.proj_drop(x) + return x + +class PatchEmbed3D(nn.Module): + """Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__( + self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None, + flatten=True, + ): + super().__init__() + self.patch_size = patch_size + self.flatten = flatten + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # (B 768, 16, 14, 14) patchify, for each patch, we use 768 vector to represent it + if self.norm is not None: + D, Wh, Ww = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCTHW -> BNC + return x + + + +class STBlock(nn.Module): + def __init__( + self, + hidden_size, + num_heads, + d_s=None, + d_t=None, + mlp_ratio=4.0, + drop_path=0.0, + enable_flashattn=True, + enable_layernorm_kernel=False, + temporal_casual=True, + no_temporal=False, + temporal_group = False, + group_size = 1 + # enable_sequence_parallelism=False, + ): + super().__init__() + self.hidden_size = hidden_size + self.enable_flashattn = enable_flashattn + + self.attn_cls = Attention + self.no_temporal = no_temporal + self.attn_group = GroupAttention + self.temporal_group = temporal_group + self.group_size = group_size + + self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) + self.attn = self.attn_cls( + hidden_size, + num_heads=num_heads, + 
qkv_bias=True, + enable_flashattn=enable_flashattn, + ) + self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) + self.mlp = Mlp( + in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0 + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5) + + # temporal attention + self.d_s = d_s + self.d_t = d_t + if self.temporal_group: + self.attn_temp = self.attn_group( + hidden_size, + num_heads=num_heads, + qkv_bias=True, + enable_flashattn=self.enable_flashattn, + group_size=self.group_size, + ) + else: + self.attn_temp = self.attn_cls( + hidden_size, + num_heads=num_heads, + qkv_bias=True, + enable_flashattn=self.enable_flashattn, + ) + self.temporal_casual = temporal_casual + + def forward(self, x, tpe=None): + + # B, T, S, C = x.shape[0] + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + ).chunk(6, dim=1) + x = x.to(torch.float64) + x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa).to(torch.float64) + + # spatial branch + x_s = rearrange(x_m, "B T S C -> (B T) S C", T=self.d_t, S=self.d_s) + # print(x_s.dtype) + # x_s = x_s.to(torch.float32) + x_s = x_s.to(torch.bfloat16) + x_s = self.attn(x_s, causal=False,).to(torch.bfloat16) + x_s = rearrange(x_s, "(B T) S C -> B T S C", T=self.d_t, S=self.d_s) + x = x + self.drop_path(gate_msa * x_s) + + if not self.no_temporal: + # temporal branch + x_t = rearrange(x, "B T S C -> (B S) T C", T=self.d_t, S=self.d_s) + + if tpe is not None: + x_t = x_t + tpe + x_t = x_t.to(torch.bfloat16) + x_t = self.attn_temp(x_t, causal=self.temporal_casual,) + x_t = rearrange(x_t, "(B S) T C -> B T S C", T=self.d_t, S=self.d_s).to(torch.bfloat16) + x = x + self.drop_path(gate_msa * x_t) + + # mlp + x = x.to(torch.float32) + x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) + x = x.to(torch.float32) + + return x + + +def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0): + pos = np.arange(0, length)[..., None] / scale + return get_1d_sincos_pos_embed_from_grid(embed_dim, pos) + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if not isinstance(grid_size, tuple): + grid_size = (grid_size, grid_size) + + grid_h = np.arange(grid_size[0], dtype=np.float32) / scale + grid_w = np.arange(grid_size[1], dtype=np.float32) / scale + if base_size is not None: + grid_h *= base_size / grid_size[0] + grid_w *= base_size / grid_size[1] + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size[1], 
grid_size[0]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def exists(v): + return v is not None + +def default(v, d): + return v if exists(v) else d + +def divisible_by(num, den): + return (num % den) == 0 + +def is_odd(n): + return not divisible_by(n, 2) + +def cast_tuple(t, length = 1): + if isinstance(t, ListConfig): + return tuple(t) + return t if isinstance(t, tuple) else ((t,) * length) + +class DepthToSpace(nn.Module): + + def __init__(self, block_size): + super().__init__() + self.bs, self.bt = block_size + + def forward(self, x): + B, C, N, H, W = x.size() + x = x.view(B, self.bt, self.bs, self.bs, C // ((self.bs ** 2) * self.bt), N, H, W) # (B, bs, bs, bs, C//bs^2, N, H, W) + x = x.permute(0, 4, 5, 1, 6, 2, 7, 3).contiguous() # (B, C//bs^3, N, bs, H, bs, W, bs) + x = x.view(B, C // ((self.bs ** 2) * self.bt), N * self.bt, H * self.bs, W * self.bs) # (B, C//bs^3, N * bs, H * bs, W * bs) + # remove the first frame + if self.bt > 1: + x = x[:, :, 1:, :, :] + else: + x = x + return x + + +# Swish Function +class Swish(nn.Module): + def forward(self, x): + return x * torch.sigmoid(x) + + + +class STTransformer(nn.Module): + def __init__( + self, + input_size=(1, 32, 32), + in_channels=4, + patch_size=(1, 2, 2), + hidden_size=1152, + depth=28, + num_heads=16, + mlp_ratio=4.0, + pred_sigma=False, + drop_path=0.0, + no_temporal_pos_emb=False, + space_scale=1.0, + time_scale=1.0, + freeze=None, + enable_flashattn=False, + enable_layernorm_kernel=False, + temporal_casual=True, + no_temporal=False, + temporal_group=False, + group_size=1, + ): + super().__init__() + self.pred_sigma = pred_sigma + self.in_channels = in_channels + self.out_channels = in_channels * 2 if pred_sigma else in_channels + self.hidden_size = hidden_size + self.patch_size = patch_size + self.input_size = input_size + num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)]) + self.num_patches = num_patches + self.num_temporal = input_size[0] // patch_size[0] + self.num_spatial = num_patches // self.num_temporal + self.num_heads = num_heads + self.no_temporal_pos_emb = no_temporal_pos_emb + self.depth = depth + self.mlp_ratio = mlp_ratio + self.enable_flashattn = enable_flashattn + self.enable_layernorm_kernel = enable_layernorm_kernel + self.space_scale = space_scale + self.time_scale = time_scale + self.temporal_casual = temporal_casual + self.temporal_group = temporal_group + self.group_size = group_size + + self.register_buffer("pos_embed", self.get_spatial_pos_embed()) + self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed()) + + self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size) + self.no_temporal = no_temporal + + drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] + self.blocks = nn.ModuleList( + [ + STBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=self.mlp_ratio, + drop_path=drop_path[i], + enable_flashattn=self.enable_flashattn, + enable_layernorm_kernel=self.enable_layernorm_kernel, + 
d_t=self.num_temporal, + d_s=self.num_spatial, + temporal_casual=self.temporal_casual, + no_temporal=self.no_temporal, + temporal_group = self.temporal_group, + group_size = self.group_size + ) + for i in range(self.depth) + ] + ) + self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels) + + # init model + self.initialize_weights() + self.initialize_temporal() + if freeze is not None: + assert freeze in ["not_temporal", "text"] + if freeze == "not_temporal": + self.freeze_not_temporal() + elif freeze == "text": + self.freeze_text() + + + + def forward(self, x): + """ + Forward pass of STDiT. + Args: + x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W] + + Returns: + x (torch.Tensor): output latent representation; of shape [B, C, T, H, W] + """ + + x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial) + x = x + self.pos_embed + + with autocast(enabled=True): + for i, block in enumerate(self.blocks): + if i == 0: + tpe = self.pos_embed_temporal + else: + tpe = None + x = auto_grad_checkpoint(block, x, tpe) + + x = rearrange(x, "B T S C -> B (T S) C", T=self.num_temporal, S=self.num_spatial) + return x + + def unpatchify(self, x): + """ + Args: + x (torch.Tensor): of shape [B, N, C] + + Return: + x (torch.Tensor): of shape [B, C_out, T, H, W] + """ + + N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)] + T_p, H_p, W_p = self.patch_size + x = rearrange( + x, + "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)", + N_t=N_t, + N_h=N_h, + N_w=N_w, + T_p=T_p, + H_p=H_p, + W_p=W_p, + C_out=self.out_channels, + ) + return x + + def unpatchify_old(self, x): + c = self.out_channels + t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)] + pt, ph, pw = self.patch_size + + x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c)) + x = rearrange(x, "n t h w r p q c -> n c t r h p w q") + imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw)) + return imgs + + def get_spatial_pos_embed(self, grid_size=None): + if grid_size is None: + grid_size = self.input_size[1:] + pos_embed = get_2d_sincos_pos_embed( + self.hidden_size, + (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]), + scale=self.space_scale, + ) + pos_embed = torch.from_numpy(pos_embed).unsqueeze(0).requires_grad_(False) + return pos_embed + + def get_temporal_pos_embed(self): + pos_embed = get_1d_sincos_pos_embed( + self.hidden_size, + self.input_size[0] // self.patch_size[0], + scale=self.time_scale, + ) + pos_embed = torch.from_numpy(pos_embed).unsqueeze(0).requires_grad_(False) + return pos_embed + + def freeze_not_temporal(self): + for n, p in self.named_parameters(): + if "attn_temp" not in n: + p.requires_grad = False + + def freeze_text(self): + for n, p in self.named_parameters(): + if "cross_attn" in n: + p.requires_grad = False + + def initialize_temporal(self): + for block in self.blocks: + nn.init.constant_(block.attn_temp.proj.weight, 0) + nn.init.constant_(block.attn_temp.proj.bias, 0) + + def initialize_weights(self): + # Initialize transformer layers: + def _basic_init(module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + self.apply(_basic_init) + + w = self.x_embedder.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + nn.init.constant_(self.final_layer.linear.weight, 0) + 
nn.init.constant_(self.final_layer.linear.bias, 0) + +class STTEncoder(STTransformer): + def __init__(self, input_size=(1, 32, 32), in_channels=3, patch_size=(1, 2, 2), hidden_size=64, depth=12, num_heads=8, mlp_ratio=4, pred_sigma=False, drop_path=0, no_temporal_pos_emb=False, space_scale=1, time_scale=1, freeze=None, enable_flashattn=True, enable_layernorm_kernel=False, temporal_casual=True, no_temporal=False, temporal_group=False, group_size=1): + super().__init__(input_size, in_channels, patch_size, hidden_size, depth, num_heads, mlp_ratio, pred_sigma, drop_path, no_temporal_pos_emb, space_scale, time_scale, freeze, enable_flashattn, enable_layernorm_kernel, temporal_casual, no_temporal, temporal_group, group_size) + + def forward(self, x): + x = self.x_embedder(x) + y = super().forward(x) + y = rearrange(y, "B (T H W) C -> B C T H W", T=self.input_size[0], H=self.input_size[1]//self.patch_size[1], W=self.input_size[2]//self.patch_size[2]) + return y + + @property + def device(self): + return self.zero.device + + @classmethod + def init_and_load_from(cls, path, strict = True): + path = Path(path) + assert path.exists() + pkg = torch.load(str(path), map_location = 'cpu') + + assert 'config' in pkg, 'model configs were not found in this saved checkpoint' + + config = pickle.loads(pkg['config']) + tokenizer = cls(**config) + tokenizer.load(path, strict = strict) + return tokenizer + + def save(self, path, overwrite = True): + path = Path(path) + assert overwrite or not path.exists(), f'{str(path)} already exists' + + pkg = dict( + model_state_dict = self.state_dict(), + version =self.__version__, + config = self._configs + ) + + torch.save(pkg, str(path)) + + def load(self, path, strict = True): + path = Path(path) + assert path.exists() + + pkg = torch.load(str(path)) + state_dict = pkg.get('model_state_dict') + version = pkg.get('version') + + assert exists(state_dict) + + if exists(version): + print(f'loading checkpointed tokenizer from version {version}') + + self.load_state_dict(state_dict, strict = strict) + + + @torch.no_grad() + def tokenize(self, video): + self.eval() + return self.forward(video, return_codes = True) + + def debug_model(self, x, layer): + if torch.isnan(x).any(): + print('x has nan') + print(layer) + import sys + sys.exit() + + + +class STTDecoder(STTransformer): + def __init__(self, input_size=(1, 32, 32), in_channels=3, patch_size=(1, 2, 2), hidden_size=1152, depth=12, num_heads=16, mlp_ratio=4, pred_sigma=False, drop_path=0, no_temporal_pos_emb=False, space_scale=1, time_scale=1, freeze=None, enable_flashattn=True, enable_layernorm_kernel=False, temporal_casual=True, no_temporal=False): + super().__init__(input_size, in_channels, patch_size, hidden_size, depth, num_heads, mlp_ratio,pred_sigma, drop_path, no_temporal_pos_emb, space_scale, time_scale, freeze, enable_flashattn, enable_layernorm_kernel, temporal_casual, no_temporal) + self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels) + + def forward(self, x): + x = rearrange(x, "B C T H W -> B (T H W) C") + y = super().forward(x) + y = self.final_layer(y) + y = self.unpatchify(y) + return y + + @property + def device(self): + return self.zero.device + + @classmethod + def init_and_load_from(cls, path, strict = True): + path = Path(path) + assert path.exists() + pkg = torch.load(str(path), map_location = 'cpu') + + assert 'config' in pkg, 'model configs were not found in this saved checkpoint' + + config = pickle.loads(pkg['config']) + tokenizer = cls(**config) + 
tokenizer.load(path, strict = strict) + return tokenizer + + def save(self, path, overwrite = True): + path = Path(path) + assert overwrite or not path.exists(), f'{str(path)} already exists' + + pkg = dict( + model_state_dict = self.state_dict(), + version = self.__version__, + config = self._configs + ) + + torch.save(pkg, str(path)) + + def load(self, path, strict = True): + path = Path(path) + assert path.exists() + + pkg = torch.load(str(path)) + state_dict = pkg.get('model_state_dict') + version = pkg.get('version') + + assert exists(state_dict) + + if exists(version): + print(f'loading checkpointed tokenizer from version {version}') + + self.load_state_dict(state_dict, strict = strict) + + + @torch.no_grad() + def tokenize(self, video): + self.eval() + return self.forward(video, return_codes = True) + + def debug_model(self, x, layer): + if torch.isnan(x).any(): + print('x has nan') + print(layer) + import sys + sys.exit() + + def get_last_layer(self): + return self.final_layer.linear.weight \ No newline at end of file diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_evaluate.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4a7b33dffb2a51c36294124a1ea7079ac70c97 --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_evaluate.py @@ -0,0 +1,208 @@ +import argparse +import os +import sys +sys.path.append(os.getcwd()) + +import warnings +warnings.filterwarnings("ignore") + +import time +import numpy as np +import torch +from contextlib import nullcontext +from pathlib import Path + +import decord +from einops import rearrange +from lightning.pytorch import seed_everything +from omegaconf import OmegaConf +from safetensors.torch import load_file as load_safetensors +from torch import autocast +from torchvision import transforms +from tqdm import tqdm + +from vidtok.modules.lpips import LPIPS +from vidtok.data.vidtok import VidTokValDataset +from vidtok.modules.util import instantiate_from_config, print0, compute_psnr, compute_ssim + + +def load_model_from_config(config, ckpt, verbose=False): + config = OmegaConf.load(config) + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Loading model from {ckpt}") + model = instantiate_from_config(config.model) + + if ckpt.endswith("ckpt"): + sd = torch.load(ckpt, map_location="cpu")["state_dict"] + elif ckpt.endswith("safetensors"): + sd = load_safetensors(ckpt) + else: + raise NotImplementedError(f"Unknown checkpoint: {ckpt}") + + new_sd = {} + for k, v in sd.items(): + if k.startswith("loss"): + continue + new_sd[k] = v + missing, unexpected = model.load_state_dict(new_sd, strict=False) + print0( + f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Restored from {ckpt} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + + if len(missing) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Unexpected Keys: {unexpected}") + return model + + +class MultiVideoDataset(VidTokValDataset): + def __init__( + self, + data_dir, + meta_path=None, + input_height=256, + input_width=256, + num_frames_per_batch=17, + sample_fps=30, + ): + super().__init__( + data_dir=data_dir, + meta_path=meta_path, + video_params={ + "input_height": input_height, + "input_width": input_width, + "sample_num_frames": 
num_frames_per_batch, + "sample_fps": sample_fps, + }, + pre_load_frames=True, + last_frames_handle="repeat", + ) + + def __getitem__(self, idx): + frames = super().__getitem__(idx)["jpg"] + return frames + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--data_dir", + type=str, + default="", + help="root folder", + ) + parser.add_argument( + "--meta_path", + type=str, + default=None, + help="path to the .csv meta file", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=17, + help="number of frames per batch", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset = MultiVideoDataset( + data_dir=args.data_dir, + meta_path=args.meta_path, + input_height=args.input_height, + input_width=args.input_width, + num_frames_per_batch=args.num_frames_per_batch, + sample_fps=args.sample_fps + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + perceptual_loss = LPIPS().eval() + perceptual_loss = perceptual_loss.to(device) + + psnrs, ssims, lpipss = [], [], [] + + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, output, *_ = model(input) + + output = output.clamp(-1, 1) + input, output = map(lambda x: (x + 1) / 2, (input, output)) + + if input.dim() == 5: + input = rearrange(input, "b c t h w -> (b t) c h w") + assert output.dim() == 5 + output = rearrange(output, "b c t h w -> (b t) c h w") + + psnr = compute_psnr(input, output) + ssim = compute_ssim(input, output) + lpips = perceptual_loss(input * 2 - 1, output * 2 - 1).mean() + + psnrs.append(psnr.item()) + ssims.append(ssim.item()) + lpipss.append(lpips.item()) + + toc = time.time() + print0( + f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] PSNR: {np.mean(psnrs):.4f}, SSIM: {np.mean(ssims):.4f}, LPIPS: {np.mean(lpipss):.4f}" + ) + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_reconstruct.py 
b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_reconstruct.py new file mode 100644 index 0000000000000000000000000000000000000000..e568df36bab66e6e0061aeaee05b2da000afc0ec --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_reconstruct.py @@ -0,0 +1,191 @@ +import os +import sys +sys.path.append(os.getcwd()) + +import argparse +import warnings +warnings.filterwarnings("ignore") + +import time +import numpy as np +from contextlib import nullcontext +from pathlib import Path + +import torch +from einops import rearrange +from lightning.pytorch import seed_everything +from torch import autocast +from torchvision.io import write_video +from tqdm import tqdm + +from vidtwin.scripts.inference_evaluate import print0, load_model_from_config, transforms, decord + + +class SingleVideoDataset(torch.utils.data.Dataset): + def __init__(self, video_path, input_height=128, input_width=128, num_frames_per_batch=16, sample_fps=8): + decord.bridge.set_bridge("torch") + self.video_path = video_path + normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + self.transform = transforms.Compose( + [ + transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x : x + num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x : x + num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255.0 + frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + 
help="width of the input video", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=17, + help="number of frames per batch", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwininference_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + os.makedirs(args.output_video_dir, exist_ok=True) + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset = SingleVideoDataset( + args.input_video_path, args.input_height, args.input_width, args.num_frames_per_batch, args.sample_fps + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + inputs = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, xrec, *_ = model(input) + input = rearrange(input, "b c t h w -> (b t) c h w") + inputs.append(input) + xrec = rearrange(xrec, "b c t h w -> (b t) c h w") + outputs.append(xrec) + + toc = time.time() + + # save the outputs as videos + inputs = tensor_to_uint8(torch.cat(inputs, dim=0)) + inputs = rearrange(inputs, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs.shape[0], outputs.shape[0]) + final = np.concatenate([inputs[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"{Path(args.input_video_path).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + + print0(f"[bold red]Results saved in: {output_video_path}[/bold red]") + print0(f"[bold red]\[vidtwin.scripts.inference_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py new file mode 100644 index 0000000000000000000000000000000000000000..69a88da7061da0f669fb62c64f0a7cea207700ce --- /dev/null +++ b/Meissonic/VidTok/vidtok_cache/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py @@ -0,0 +1,264 @@ +import argparse +import datetime +import glob +import inspect +import os +import re +import sys +import numpy as np +import warnings +warnings.filterwarnings("ignore") +from inspect import Parameter +from typing import Union +from matplotlib import pyplot as plt +from natsort import natsorted +from omegaconf import OmegaConf +from packaging import version +from PIL import Image +from pathlib import Path +from tqdm import tqdm + +import torch +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch import seed_everything +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +import decord +import time +from einops import rearrange +from contextlib import nullcontext +from 
torch import autocast +from torchvision import transforms +from torchvision.utils import save_image +from torchvision.io import write_video +from safetensors.torch import load_file as load_safetensors + +from vidtok.modules.util import instantiate_from_config, print0 + + +def load_model_from_config(config, ckpt, verbose=False): + config = OmegaConf.load(config) + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Loading model from {ckpt}") + model = instantiate_from_config(config.model) + + if ckpt.endswith("ckpt"): + sd = torch.load(ckpt, map_location="cpu")["state_dict"] + elif ckpt.endswith("safetensors"): + sd = load_safetensors(ckpt) + else: + raise NotImplementedError(f"Unknown checkpoint: {ckpt}") + + missing, unexpected = model.load_state_dict(sd, strict=False) + print0( + f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Restored from {ckpt} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if len(missing) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Unexpected Keys: {unexpected}") + return model + + +class VideoDataset(torch.utils.data.Dataset): + def __init__(self, video_path, input_height=128, input_width=128, sample_fps=8, num_frames_per_batch=16): + decord.bridge.set_bridge("torch") + self.video_path = video_path + normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + self.transform = transforms.Compose([transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + normalize,]) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x:x+num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x:x+num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255. 
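+ # frames: (T, C, H, W) in [0, 1]; the transform resizes, center-crops and normalizes to [-1, 1], and the final permute yields (C, T, H, W).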
+ frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", + type=str, + help="evaluate at this precision", + choices=["full", "autocast"], + default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/tvae3d/webvid_kl_f_16_128_884_8chn_80G4.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="xxxxx.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path_structure", + type=str, + default="logs/assets/Nik.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_video_path_dynamics", + type=str, + default="logs/assets/Nik.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=4, + help="", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=16, + help="", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + parser.add_argument( + "--dynamics_split", + type=str2bool, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + os.makedirs(args.output_video_dir, exist_ok=True) + print(args.ckpt) + print(args.config) + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset_structure = VideoDataset(args.input_video_path_structure, args.input_height, args.input_width, args.sample_fps, args.num_frames_per_batch) + dataset_dynamics = VideoDataset(args.input_video_path_dynamics, args.input_height, args.input_width, args.sample_fps, args.num_frames_per_batch) + min_len = min(len(dataset_structure), len(dataset_dynamics)) + dataset_structure = torch.utils.data.Subset(dataset_structure, range(min_len)) + dataset_dynamics = torch.utils.data.Subset(dataset_dynamics, range(min_len)) + dataloader_structure = torch.utils.data.DataLoader(dataset_structure, batch_size=1, shuffle=False) + dataloader_dynamics = torch.utils.data.DataLoader(dataset_dynamics, batch_size=1, shuffle=False) + + inputs_structure = [] + inputs_dynamics = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input_structure, input_dynamics in 
zip(tqdm(range(min_len)), dataloader_structure, dataloader_dynamics): + if input_structure.shape[2] <= 5: + continue + input_structure = input_structure.to(device) + input_dynamics = input_dynamics.to(device) + if args.dynamics_split: + z, z_structure, *_ = model.encode(input_structure) + _, _, z_dynamics_x, z_dynamics_y = model.encode(input_dynamics) + xrec = model.decode(z, z_structure, z_dynamics_x, z_dynamics_y) + else: + z, z_structure, *_ = model.encode(input_structure) + _, _, z_dynamics = model.encode(input_dynamics) + xrec = model.decode(z, z_structure, z_dynamics) + input_structure = rearrange(input_structure, "b c t h w -> (b t) c h w") + inputs_structure.append(input_structure) + input_dynamics = rearrange(input_dynamics, "b c t h w -> (b t) c h w") + inputs_dynamics.append(input_dynamics) + xrec = rearrange(xrec, "b c t h w -> (b t) c h w") + outputs.append(xrec) + toc = time.time() + + # save the outputs as videos + inputs_structure = tensor_to_uint8(torch.cat(inputs_structure, dim=0)) + inputs_structure = rearrange(inputs_structure, "t c h w -> t h w c") + inputs_dynamics = tensor_to_uint8(torch.cat(inputs_dynamics, dim=0)) + inputs_dynamics = rearrange(inputs_dynamics, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs_structure.shape[0],inputs_dynamics.shape[0], outputs.shape[0]) + final = np.concatenate([inputs_structure[:min_len], inputs_dynamics[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"structure_{Path(args.input_video_path_structure).stem}_dynamics_{Path(args.input_video_path_dynamics).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Saved the reconstructed video to {output_video_path}") + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_0.png b/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_0.png new file mode 100644 index 0000000000000000000000000000000000000000..25f94a0bfadb3c6de3c8dc4cd7865a255f827c97 --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e334468dc488b4ecf5b84cf1d3dc93ae69f73c71d2181053ab00e8958c6a91f +size 1942872 diff --git a/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_1.png b/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_1.png new file mode 100644 index 0000000000000000000000000000000000000000..80541f421259e2ae4eb40a74cc71238d42fe06e4 --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0818fb2fc865b7a9c7d435af907df9d802ae93bfaf6c507721216257900f06eb +size 2453329 diff --git a/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_2.png b/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_2.png new file mode 100644 index 0000000000000000000000000000000000000000..61bc7459eea99fa7eb086683dad3d70e62ef253d --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:01cebfe13ca8976bc57c7dcb44e66f9f114fb263d9f6f544217a6eba6a2a55dd +size 2556324 diff --git a/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_3.png b/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_3.png new file mode 100644 index 0000000000000000000000000000000000000000..d8f65c02acf53e965978f0105f860971ccc0e69c --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/comparison_grid_video_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d630a54b100b516aacc4de215c6adb8b73a66d4a37941f0520bc5f49584af3e +size 2589245 diff --git a/Meissonic/VidTok/vidtok_test_output/comparison_video_0.mp4 b/Meissonic/VidTok/vidtok_test_output/comparison_video_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8ae649f344ad343b96b43820b53b51d382517c3c --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/comparison_video_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bcc3fd695035e6d8a6f257b4c23b21d7b69637fb0c7d482f8a106423e0573ed +size 168923 diff --git a/Meissonic/VidTok/vidtok_test_output/comparison_video_1.mp4 b/Meissonic/VidTok/vidtok_test_output/comparison_video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0e0bde46dfa34debc2e50d42306013566e54a276 --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/comparison_video_1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fd0b2311b490799703646dfca094762df957470b40955ab738592eed449d60a +size 544867 diff --git a/Meissonic/VidTok/vidtok_test_output/comparison_video_2.mp4 b/Meissonic/VidTok/vidtok_test_output/comparison_video_2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b0a3b8153324c607f247796c35086565cd3d4867 --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/comparison_video_2.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3132a14282c5202fe7413c22d6521d68aaa65973e698fa36f9a7357726f4892c +size 710836 diff --git a/Meissonic/VidTok/vidtok_test_output/comparison_video_3.mp4 b/Meissonic/VidTok/vidtok_test_output/comparison_video_3.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8d84f347a387137da2662d6b6705cb3ba997b8d3 --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/comparison_video_3.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b46a1b634fd3b532678c1bd8040b5607dea8ae7a886a8d827f8bc06012020c +size 446644 diff --git a/Meissonic/VidTok/vidtok_test_output/metrics_video_0.txt b/Meissonic/VidTok/vidtok_test_output/metrics_video_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..91a8a059338683cdeac03da2f7c3281ac216b0ea --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/metrics_video_0.txt @@ -0,0 +1,15 @@ +Video Index: 0 +Video Path: 000/000/000/0.mp4 +Caption: In the video, a man is seen in a living room setting, standing in front of a window with blinds. He is wearing a black sweater and appears to be in the middle of a conversation. The room is dimly lit, with a lamp providing a soft glow in the background. The man's expression is serious, suggesting that the conversation is of importance. The overall style of the video is realistic and naturalistic, capturing a candid moment in the man's life. 
+Model Config: None +Model Checkpoint: microsoft/VidTok +Use Continuous Latent: False + +=== Metrics === +Average PSNR: 35.84 dB +Average MSE: 0.000281 +Average SSIM: 0.9981 + +Per-frame PSNR: [33.037803649902344, 31.485530853271484, 34.3515739440918, 36.649356842041016, 36.190059661865234, 35.286495208740234, 35.109378814697266, 37.02665710449219, 36.84290313720703, 35.97552490234375, 36.14148712158203, 36.9006233215332, 36.93511199951172, 36.49894714355469, 36.815330505371094, 37.31867599487305, 36.788856506347656] +Per-frame MSE: [0.0004968433640897274, 0.0007103082025423646, 0.00036714912857860327, 0.0002163038298022002, 0.0002404329861747101, 0.0002960400888696313, 0.00030836285441182554, 0.00019830527890007943, 0.0002068757457891479, 0.00025260841357521713, 0.00024313709582202137, 0.00020414454047568142, 0.0002025298454100266, 0.00022392638493329287, 0.000208193450816907, 0.0001854096626630053, 0.00020946632139384747] +Per-frame SSIM: [0.9971727132797241, 0.9959272146224976, 0.9976918697357178, 0.99843430519104, 0.9982582926750183, 0.9978524446487427, 0.997775137424469, 0.9985707998275757, 0.9985087513923645, 0.9981595873832703, 0.9982331395149231, 0.9985169768333435, 0.9985323548316956, 0.9983784556388855, 0.9984921216964722, 0.9986546039581299, 0.9984803199768066] diff --git a/Meissonic/VidTok/vidtok_test_output/metrics_video_1.txt b/Meissonic/VidTok/vidtok_test_output/metrics_video_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e8f6d8a2ff046073344eae40d49cf66369696ea --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/metrics_video_1.txt @@ -0,0 +1,15 @@ +Video Index: 1 +Video Path: 000/000/001/1.mp4 +Caption: The video shows a man standing next to a purple van with a floral design on the side. The man is wearing a black t-shirt and jeans, and he is smiling and waving his hands in the air. The van has pink rims and a black roof rack. The van is parked in front of a building with a glass door. The man appears to be happy and excited about the van. The video is likely a short clip of a man showing off his van. 
+Model Config: None +Model Checkpoint: microsoft/VidTok +Use Continuous Latent: False + +=== Metrics === +Average PSNR: 26.30 dB +Average MSE: 0.002472 +Average SSIM: 0.9768 + +Per-frame PSNR: [31.058448791503906, 26.61920738220215, 25.658832550048828, 26.627016067504883, 25.40911865234375, 24.536104202270508, 24.99111557006836, 26.337379455566406, 25.426624298095703, 24.527191162109375, 25.4383487701416, 26.555984497070312, 25.69927215576172, 25.481670379638672, 26.91702651977539, 28.13100814819336, 27.644102096557617] +Per-frame MSE: [0.0007837092853151262, 0.0021781064569950104, 0.0027171701658517122, 0.0021741949021816254, 0.0028779818676412106, 0.003518760437145829, 0.003168752882629633, 0.0023241378366947174, 0.0028664045967161655, 0.0035259875003248453, 0.0028586769476532936, 0.00221004756167531, 0.0026919858064502478, 0.002830302808433771, 0.002033749595284462, 0.0015377980889752507, 0.001720242784358561] +Per-frame SSIM: [0.9923760294914246, 0.9781950116157532, 0.9733944535255432, 0.9788379669189453, 0.9719116687774658, 0.966019868850708, 0.9692208766937256, 0.9775781035423279, 0.972322404384613, 0.9663904905319214, 0.9733926653862, 0.9801130294799805, 0.9764418005943298, 0.9755927920341492, 0.9825443625450134, 0.986733615398407, 0.985124409198761] diff --git a/Meissonic/VidTok/vidtok_test_output/metrics_video_2.txt b/Meissonic/VidTok/vidtok_test_output/metrics_video_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..b052266b75f16a84040de8e83027a0f765145ae4 --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/metrics_video_2.txt @@ -0,0 +1,15 @@ +Video Index: 2 +Video Path: 000/000/002/2.mp4 +Caption: The video is a news segment featuring a man in a red baseball cap and a blue vest, standing in front of a statue of a soldier and two children. The man appears to be a veteran, as indicated by the cap and the context of the event. The event is an honorary ceremony for lost submarines and submarine veterans, taking place near the World Peace Bell in Newport. The news segment is titled "Connected to the Community" and is scheduled to air at 11:10 PM on ABC 9. The style of the video is informative and respectful, focusing on the man and the event, with a clear and concise presentation of the details. 
+Model Config: None +Model Checkpoint: microsoft/VidTok +Use Continuous Latent: False + +=== Metrics === +Average PSNR: 23.94 dB +Average MSE: 0.004205 +Average SSIM: 0.9741 + +Per-frame PSNR: [24.750844955444336, 23.870845794677734, 24.874595642089844, 24.559537887573242, 24.796916961669922, 24.477529525756836, 24.734580993652344, 24.929773330688477, 21.87407684326172, 20.827701568603516, 22.117740631103516, 23.532190322875977, 23.710277557373047, 24.223182678222656, 24.45770263671875, 24.587888717651367, 24.69757080078125] +Per-frame MSE: [0.0033490026835352182, 0.004101242404431105, 0.0032549200113862753, 0.0034998231567442417, 0.0033136624842882156, 0.0035665403120219707, 0.0033615671563893557, 0.003213828429579735, 0.006495194975286722, 0.00826475489884615, 0.006140812765806913, 0.004433850757777691, 0.004255711566656828, 0.0037816548720002174, 0.003582859644666314, 0.0034770509228110313, 0.0033903378061950207] +Per-frame SSIM: [0.9781184792518616, 0.9733392596244812, 0.9781230688095093, 0.9757063388824463, 0.9766383171081543, 0.9744651317596436, 0.9753068089485168, 0.9758992195129395, 0.962554931640625, 0.9547202587127686, 0.966240644454956, 0.9752574563026428, 0.9757081270217896, 0.978428840637207, 0.979464590549469, 0.9795681834220886, 0.9804643392562866] diff --git a/Meissonic/VidTok/vidtok_test_output/metrics_video_3.txt b/Meissonic/VidTok/vidtok_test_output/metrics_video_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..54f1276d060652049d46a5f617c98c813e05f2a6 --- /dev/null +++ b/Meissonic/VidTok/vidtok_test_output/metrics_video_3.txt @@ -0,0 +1,15 @@ +Video Index: 3 +Video Path: 000/000/003/3.mp4 +Caption: The video features a man in a pink shirt and a black bucket hat, wearing glasses and a necklace. He is holding a spoon and making a playful face, as if he is about to eat something. The background shows a lush garden with trees and a wooden structure. The man's expression and the spoon suggest that he is about to taste something, possibly food. The overall style of the video is casual and fun, with a focus on the man's reaction to the food. 
+Model Config: None +Model Checkpoint: microsoft/VidTok +Use Continuous Latent: False + +=== Metrics === +Average PSNR: 27.69 dB +Average MSE: 0.001723 +Average SSIM: 0.9898 + +Per-frame PSNR: [28.46287727355957, 28.55240249633789, 27.580657958984375, 27.244449615478516, 27.244640350341797, 25.913394927978516, 26.88213348388672, 27.892009735107422, 27.893531799316406, 27.678852081298828, 28.061325073242188, 28.302602767944336, 28.286983489990234, 27.337810516357422, 27.708032608032227, 28.11959457397461, 27.486038208007812] +Per-frame MSE: [0.001424663234502077, 0.0013955960748717189, 0.0017455582274124026, 0.0018860582495108247, 0.0018859754782170057, 0.0025624786503612995, 0.0020501543767750263, 0.001624796655960381, 0.0016242277342826128, 0.0017065333668142557, 0.0015626702224835753, 0.001478222431614995, 0.0014835483161732554, 0.0018459452548995614, 0.0016951053403317928, 0.0015418441034853458, 0.0017840052023530006] +Per-frame SSIM: [0.9914306998252869, 0.991628885269165, 0.9895283579826355, 0.9887038469314575, 0.9888691306114197, 0.9845649003982544, 0.9880149364471436, 0.9904597401618958, 0.9903984665870667, 0.9898774027824402, 0.9908393025398254, 0.9913633465766907, 0.9912683367729187, 0.9891148805618286, 0.9899492263793945, 0.9908106923103333, 0.9893209338188171] diff --git a/Meissonic/VidTok/vidtwin/README.md b/Meissonic/VidTok/vidtwin/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e475c6eade763f9cef3b67a905dd738e1fb43fef --- /dev/null +++ b/Meissonic/VidTok/vidtwin/README.md @@ -0,0 +1,211 @@ + + +

+VidTwin VidTwin: Video VAE with Decoupled Structure and Dynamics (CVPR 2025) +

+

+ Yuchi Wang   + Junliang Guo   + Xinyi Xie   + Tianyu He   + Xu Sun   + Jiang Bian +

+ +
+ +
+ +[![arXiv](https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2412.17726)   [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow)](https://huggingface.co/microsoft/vidtwin)   [![Static Badge](https://img.shields.io/badge/Demo-Project_Page-yellow)](https://vidtwin.github.io/) + +

🔥 Check our Demo Page for an enhanced visual experience.

+ + +
+
+
+ +We propose a novel and compact video autoencoder, VidTwin, that decouples video into two distinct latent spaces: **Structure latent vectors**, which capture overall content and global movement, and **Dynamics latent vectors**, which represent fine-grained details and rapid movements. + +Extensive experiments show that VidTwin achieves a high compression rate of 0.20% with high reconstruction quality (PSNR of 28.14 on the MCL-JCV dataset), and performs efficiently and effectively in downstream generative tasks. Moreover, our model demonstrates explainability and scalability, paving the way for future research in video latent representation and generation. +
+
+ +
+ +## Setup + +1. Our code is based on **VidTok**, so you will need to install the [required packages for VidTok](https://github.com/microsoft/VidTok?tab=readme-ov-file#setup) first. To do so, navigate to the VidTok folder and create the environment using the `environment.yaml` file: + +```bash +cd VidTok +# Prepare conda environment +conda env create -f environment.yaml +# Activate the environment +conda activate vidtok +``` + +2. After setting up VidTok, install the additional packages required for the VidTwin model: +```bash +pip install transformers +pip install timm +pip install flash-attn --no-build-isolation +``` + + +## Training + +### Data Preparation + +We follow the same approach as **VidTok** to prepare the data. You can also find the Dataloader class in `vidtok/data/vidtok.py`. This Dataloader is a general-purpose class for handling video data. You may customize it to suit your own dataset and specific use cases. + +1. Put all training videos under `DATA_DIR`: +``` +└── DATA_DIR + ├── subset1 + │ ├── videoname11.mp4 + │ └── videoname12.mp4 + ├── subset2 + │ ├── videoname21.mp4 + │ ├── videoname22.mp4 + │ └── subsubset1 + │ ├── videoname211.mp4 + │ └── videoname212.mp4 + └── ... +``` +2. Prepare a `.csv` meta file to record the relative paths of these videos with respect to `DATA_DIR`, like: +``` +videos +subset1/videoname11.mp4 +subset2/videoname21.mp4 +subset2/subsubset1/videoname211.mp4 +``` + +> Validation data is also prepared following the above steps. + +### Launch Training + +1. Specify the Configuration File + +Our code follows a **modular design**, allowing you to easily customize the model structure and training settings by modifying a configuration file. For the **VidTwin** model, we provide the following configuration file: `configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml`. + +- In the **Model** section of the configuration file, you can specify the model's structure and key hyperparameters. For instance, you can adjust the following settings: + +```yaml +model: + params: + expect_ch: 8 # the dimension of the Structure Latent, d_S + cont_num_blocks: 1 # downsample blocks of the Structure Latent, 1 -> h_S = 7, 2 -> h_S = 4, 3 -> h_S = 2 + downsample_motion: True + motion_num_blocks: 1 # downsample blocks of the Dynamics Latent, 1 -> h_D = 7, 2 -> h_D = 4, 3 -> h_D = 2 + d_dim: 8 # the dimension of the Dynamics Latent, d_D +``` + +- If you'd like to **fine-tune** the model from a pre-trained checkpoint instead of training from scratch, you can specify the `ckpt_path` parameter in the configuration file. + +```yaml +model: + params: + ckpt_path: PATH_TO_CHECKPOINT # train from existing checkpoint +``` + +- In the **Data** section of the configuration file, you can specify paths and other important data-related hyperparameters.
+ +```yaml +train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 # 224 for our VidTwin model + input_width: INPUT_WIDTH_1 # 224 for our VidTwin model + sample_num_frames: NUM_FRAMES_1 # set to 16 for our VidTwin model + sample_fps: SAMPLE_FPS_1 # sample fps for training data, 8 for VidTwin model +validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 # 224 for our VidTwin model + input_width: INPUT_WIDTH_2 # 224 for our VidTwin model + sample_num_frames: NUM_FRAMES_2 # set to 16 for our VidTwin model + sample_fps: SAMPLE_FPS_2 # sample fps for validation data + start_index: 0 # fixed value to ensure the same sampled data +``` + +2. Run the following command to start training: +```bash +python main.py -b CONFIG --logdir LOGDIR + +# You can also use `torchrun` to start the training code. +``` +Training logs and checkpoints are saved in `LOGDIR`. + +It is recommended to use [Weights & Biases](https://wandb.ai/site) as the data visualization tool ([TensorBoard](https://www.tensorflow.org/tensorboard) by default). Use `wandb login` to log in first, and then run: +``` +python main.py -b CONFIG --logdir LOGDIR --wandb --wandb_entity ENTITY --wandb_project PROJECT +``` + +## Inference + + +### Easy Usage +We provide the following example for quick usage of our models. After downloading the checkpoint from our [Huggingface page](https://huggingface.co/microsoft/vidtwin), just provide the path of the configuration file `cfg_path` and the checkpoint file `ckpt_path` to the script. +```python +import torch +from scripts.inference_evaluate import load_model_from_config + +cfg_path = "configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml" +ckpt_path = "checkpoints/vidtwin_structure_7_7_8_dynamics_7_8.ckpt" + +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +# load pre-trained model +model = load_model_from_config(cfg_path, ckpt_path) +model.to(device).eval() +# random input +num_frames = 16 +x_input = (torch.rand(1, 3, num_frames, 224, 224) * 2 - 1).to(device) # [B, C, T, H, W], range -1~1 +# model forward +_, x_recon, *_ = model(x_input) +assert x_input.shape == x_recon.shape +``` + +### Reconstruct an Input Video +```bash +python vidtwin/scripts/inference_reconstruct.py --config CONFIG --ckpt CKPT --input_video_path VIDEO_PATH --num_frames_per_batch NUM_FRAMES_PER_BATCH --input_height 224 --input_width 224 --sample_fps 25 --output_video_dir OUTPUT_DIR +``` +- Set `VIDEO_PATH` to the path of your test video. We provide an example video in `assets/example.mp4`. +- Set `NUM_FRAMES_PER_BATCH` to `16`. +- The reconstructed video is saved in `OUTPUT_DIR`. + +### Performance Evaluation +We also provide a script, `scripts/inference_evaluate.py`, to evaluate the video reconstruction performance in terms of PSNR, SSIM and LPIPS. + +1. Put all of your test videos under `DATA_DIR`. +2.
Run the following command, and all `.mp4` videos under `DATA_DIR` will be tested: +```bash +python vidtwin/scripts/inference_evaluate.py --config CONFIG --ckpt CKPT --data_dir DATA_DIR --num_frames_per_batch NUM_FRAMES_PER_BATCH --input_height 224 --input_width 224 --sample_fps 25 +``` +(Optional) If you only want to test certain videos under `DATA_DIR`, you need to prepare a `.csv` meta file +to indicate the video files to be tested (refer to [Data Preparation](#data-preparation)). And add `--meta_path META_PATH` to the above command to specify the path to the `.csv` meta file. + + + +### Cross-reenactment Reconstruction + +For VidTwin model, we conduct a cross-reenactment experiment in which we combine the *Structure Latent* from one video, $A$, with the *Dynamics Latent* from another video, $B$, to observe the generated output from the decoder, i.e., generating $\mathcal{D}(u^A_{\boldsymbol{S}}, u^B_{\boldsymbol{D}})$. + +To facilitate this experiment, we provide the script `vidtwin/scripts/inference_vidtwin_cross_reconstruct.py`. This script follows a similar usage method to `vidtwin/scripts/inference_reconstruct.py` with the addition of two new arguments: `--input_video_path_structure` and `--input_video_path_dynamics`, which allow you to specify the videos for structure and dynamics information, respectively. + +## BibTeX +If you find our project helpful to your research, please consider starring this repository🌟 and citing our paper. +```bibtex +@article{wang2024vidtwin, + title={VidTwin: Video VAE with Decoupled Structure and Dynamics}, + author={Wang, Yuchi and Guo, Junliang and Xie, Xinyi and He, Tianyu and Sun, Xu and Bian, Jiang}, + year={2024}, + journal={arXiv preprint arXiv:2412.17726}, +} +``` diff --git a/Meissonic/VidTok/vidtwin/models/vidtwin_ae.py b/Meissonic/VidTok/vidtwin/models/vidtwin_ae.py new file mode 100644 index 0000000000000000000000000000000000000000..22daf6cae27b2731021490adf8a4b8cb41dc5a4b --- /dev/null +++ b/Meissonic/VidTok/vidtwin/models/vidtwin_ae.py @@ -0,0 +1,1604 @@ +import os +import re +import math +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union + +import lightning.pytorch as pl +import torch +import einops +from omegaconf import ListConfig +from packaging import version +from safetensors.torch import load_file as load_safetensors + +from torch.optim.lr_scheduler import _LRScheduler, LambdaLR, StepLR +from vidtok.modules.util import default, instantiate_from_config, print0, get_valid_paths +from vidtok.modules.util import compute_psnr, compute_ssim +from vidtok.models.autoencoder import AbstractAutoencoder +import numpy as np +from torch import nn +from einops import rearrange, repeat +import transformers + + +class VidAutoEncoderQformerBase(AbstractAutoencoder): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def init_from_ckpt( + self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple() + ) -> None: + if path.endswith("ckpt"): + # sd = torch.load(path, map_location="cpu")["state_dict"] + ckpt = torch.load(path, map_location="cpu") + if "state_dict" in ckpt: + sd = ckpt["state_dict"] + else: + sd = ckpt + elif path.endswith("safetensors"): + sd = load_safetensors(path) + else: + raise NotImplementedError + + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Deleting key {k} from state_dict.") + del sd[k] + + for 
k, tensor in sd.items(): + sd[k] = tensor.to(torch.float64) + + missing, unexpected = self.load_state_dict(sd, strict=False) + print0( + f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if len(missing) > 0: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Unexpected Keys: {unexpected}") + + def get_input(self, batch: Dict) -> torch.Tensor: + # assuming unified data format, dataloader returns a dict. + # image tensors should be scaled to -1 ... 1 and in channels-first format (e.g., bchw instead if bhwc) + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(self.encoder.parameters()) + + list(self.decoder.parameters()) + + list(self.get_disentangle_params()) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) # e.g., discriminator + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + # See https://github.com/Lightning-AI/pytorch-lightning/issues/17801 and https://lightning.ai/docs/pytorch/stable/common/optimization.html for the reason of this change + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + z, xrec, regularization_log, *_ = self(x) + opt_g, opt_d = self.optimizers() + sch1, sch2 = self.lr_schedulers() + + + # autoencode loss + self.toggle_optimizer(opt_g) + # adversarial loss is binary cross-entropy + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + opt_g.step() + sch1.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + # adversarial loss is binary cross-entropy + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + opt_d.step() + + sch2.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + z, xrec, regularization_log, *_ = self(x) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + log_dict_ae.update(log_dict_disc) + 
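+        # log the merged generator/discriminator validation metrics, then report PSNR/SSIM on tensors rescaled from [-1, 1] to [0, 1]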
self.log_dict(log_dict_ae) + + # evaluate the psnr and ssim + x = x.clamp(-1, 1) + xrec = xrec.clamp(-1, 1) + x = (x + 1) / 2 + xrec = (xrec + 1) / 2 + psnr = compute_psnr(xrec, x) + ssim = compute_ssim(xrec, x) + + self.log(f"val{postfix}/psnr", psnr, prog_bar=True, logger=True, on_step=True, on_epoch=True) + self.log(f"val{postfix}/ssim", ssim, prog_bar=True, logger=True, on_step=True, on_epoch=True) + return log_dict_ae + + def configure_optimizers(self): + if self.trainable_ae_params is None: + ae_params = self.get_autoencoder_params() + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable autoencoder parameters: {len(ae_params):,}") + else: + ae_params, num_ae_params = self.get_param_groups( + self.trainable_ae_params, self.ae_optimizer_args + ) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable autoencoder parameters: {num_ae_params:,}") + if self.trainable_disc_params is None: + disc_params = self.get_discriminator_params() + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable discriminator parameters: {len(disc_params):,}") + else: + disc_params, num_disc_params = self.get_param_groups( + self.trainable_disc_params, self.disc_optimizer_args + ) + print0( + f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable discriminator parameters: {num_disc_params:,}" + ) + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + + if len(disc_params) > 0: + opt_disc = self.instantiate_optimizer_from_config( + disc_params, self.learning_rate, self.optimizer_config + ) + + lr_freq1 = 1 + lr_freq2 = 1 + if not self.use_scheduler_g: + total_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + scheduler1 = ConstantWarmupScheduler(opt_ae, warmup_steps=500, total_steps=total_steps) + else: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use generator lr scheduler: {self.lr_scheduler_config_g.target}") + lr_freq1 = self.lr_scheduler_config_g.params.frequency if hasattr(self.lr_scheduler_config_g.params, 'frequency') else 1 + max_decay_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler max_decay_steps: {max_decay_steps}") + if 'inverse_sqrt' in self.lr_scheduler_config_g.target: + scheduler1 = transformers.get_inverse_sqrt_schedule(optimizer=opt_ae, num_warmup_steps=self.lr_scheduler_config_g.params.num_warmup_steps) + elif 'LambdaWarmUpCosineScheduler' in self.lr_scheduler_config_g.target: + scheduler1 = LambdaWarmUpCosineScheduler(optimizer=opt_ae, total_steps=max_decay_steps, **self.lr_scheduler_config_g.params) + elif 'LinearWarmupScheduler' in self.lr_scheduler_config_g.target: + scheduler1 = LinearWarmupScheduler(opt_ae, total_steps=max_decay_steps, **self.lr_scheduler_config_g.params) + else: + scheduler1 = instantiate_lrscheduler_from_config(opt_ae, self.lr_scheduler_config_g, total_steps=max_decay_steps) + + if not self.use_scheduler_d: + total_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + scheduler2 = ConstantWarmupScheduler(opt_disc, warmup_steps=500, total_steps=total_steps) + else: + print0(f"[bold 
magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler: {self.lr_scheduler_config_d.target}") + lr_freq2 = self.lr_scheduler_config_d.params.frequency if hasattr(self.lr_scheduler_config_d.params, 'frequency') else 1 + max_decay_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler max_decay_steps: {max_decay_steps}") + if 'inverse_sqrt' in self.lr_scheduler_config_d.target: + scheduler2 = transformers.get_inverse_sqrt_schedule(optimizer=opt_disc, num_warmup_steps=self.lr_scheduler_config_d.params.num_warmup_steps) + elif 'LambdaWarmUpCosineScheduler' in self.lr_scheduler_config_d.target: + scheduler2 = LambdaWarmUpCosineScheduler(optimizer=opt_disc, total_steps=max_decay_steps, **self.lr_scheduler_config_d.params) + elif 'LinearWarmupScheduler' in self.lr_scheduler_config_d.target: + scheduler2 = LinearWarmupScheduler(opt_disc, total_steps=max_decay_steps, **self.lr_scheduler_config_d.params) + else: + scheduler2 = instantiate_lrscheduler_from_config(opt_disc, self.lr_scheduler_config_d, total_steps=max_decay_steps) + + + lr_scheduler_config1 = { + "optimizer": opt_ae, + "lr_scheduler": { + "scheduler": scheduler1, + "name": "lr_generator", + "interval": "step", + "frequency": lr_freq1, + } + } + lr_scheduler_config2 = { + "optimizer": opt_disc, + "lr_scheduler": { + "scheduler": scheduler2, + "name": "lr_discriminator", + "interval": "step", + "frequency": lr_freq2, + } + } + return (lr_scheduler_config1, lr_scheduler_config2) + + @torch.no_grad() + def log_images(self, batch: Dict, **kwargs) -> Dict: # called at ImageLoggerCallback.log_img() + log = dict() + x = self.get_input(batch) + _, xrec, *_ = self(x) + log["inputs"] = x + log["reconstructions"] = xrec + return log + + +class VidAutoEncoderQformer(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + height_qformer_config: Dict, + width_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.hight_qformer = instantiate_from_config(height_qformer_config) + self.width_qformer = instantiate_from_config(width_qformer_config) + + + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + 
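+        # optional per-optimizer LR schedulers: `lr_scheduler_config_g` drives the autoencoder (generator) optimizer and `lr_scheduler_config_d` the discriminator optimizer (see `configure_optimizers`)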
self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + # (bhw, f, c) -> (bhw, f',c') + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.height_emb = nn.Sequential( + nn.Linear(height_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(height_qformer_config.params.num_query_tokens, self.patch_nums[1], 1), + nn.ReLU(), + ) + + self.width_emb = nn.Sequential( + nn.Linear(width_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(width_qformer_config.params.num_query_tokens, self.patch_nums[2], 1), + nn.ReLU(), + ) + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.hight_qformer.parameters()) + + list(self.width_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.height_emb.parameters()) + + list(self.width_emb.parameters()) + ) + + return params + + + def decode(self, z, z_content, z_motion_x, z_motion_y) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h', w', c_q) + z_motion_x: shape (b, f, h_q, w', c_q) + z_motion_y: shape (b, f, h', w_q, c_q) + ''' + z_content = rearrange(z_content, 'B F H W C -> (B H W) F C') + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + z_motion_x = rearrange(z_motion_x, 'B F H W C -> (B F W) H C') + vx = rearrange(self.height_emb(z_motion_x), '(B F W) H C -> B C F H W', F=z.size(2), W=z.size(4)) + z_motion_y = rearrange(z_motion_y, 'B F H W C -> (B F H) W C') + vy = rearrange(self.width_emb(z_motion_y), '(B F H) W C -> B C F H W', F=z.size(2), H=z.size(3)) + c_plus_m = vt + vx + vy # shape (b, c', f, h', w') + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = 
self.encoder(x) # shape (b, c', f, h', w') + z_content = self.temporal_qformer(rearrange(z, 'B C F H W -> (B H W) F C')) + z_content = rearrange(z_content, '(B H W) F C -> B F H W C', H=z.size(3), W=z.size(4)) # compressed in the temporal dimension + z_motion_x = self.hight_qformer(rearrange(z, 'B C F H W -> (B F W) H C')) + z_motion_x = rearrange(z_motion_x, '(B F W) H C -> B F H W C', F=z.size(2), W=z.size(4)) # compressed in the height dimension + z_motion_y = self.width_qformer(rearrange(z, 'B C F H W -> (B F H) W C')) + z_motion_y = rearrange(z_motion_y, '(B F H) W C -> B F H W C', F=z.size(2), H=z.size(3)) # compressed in the width dimension + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, None + return z, z_content, z_motion_x, z_motion_y + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class VidAutoEncoderQformerCompact(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + space_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + retain_num_frames: bool = True, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + repeat_for_decoder: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.space_qformer = instantiate_from_config(space_qformer_config) + + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + self.repeat_for_decoder = repeat_for_decoder + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + 
self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.down_channel_temp = nn.Linear(self.hidden_dim, self.temporal_down_dim) + self.up_channel_temp = nn.Linear(self.temporal_down_dim, self.hidden_dim) + self.pre_temporal_qformer = nn.Sequential( + nn.Linear(self.temporal_down_dim * self.patch_nums[1] * self.patch_nums[2], self.hidden_dim), + nn.ReLU(), + ) + self.retain_num_frames = retain_num_frames + if not self.retain_num_frames: + self.pre_spatial_qformer = nn.Sequential( + nn.Linear(self.hidden_dim * self.patch_nums[0], 2 * self.hidden_dim), + nn.ReLU(), + nn.Linear(2 * self.hidden_dim, self.hidden_dim), + nn.ReLU(), + ) + if self.repeat_for_decoder: + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + # (bhw, f, c) -> (bhw, f',c') + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.temporal_down_dim * self.patch_nums[1] * self.patch_nums[2]), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + if retain_num_frames: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim * self.patch_nums[0]), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.space_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.spatial_emb.parameters()) + + list(self.pre_temporal_qformer.parameters()) + + list(self.down_channel_temp.parameters()) + ) + if not self.retain_num_frames: + params += list(self.pre_spatial_qformer.parameters()) + if not self.repeat_for_decoder: + params += list(self.up_channel_temp.parameters()) + return params + + def decode(self, z, z_content, z_motion, only_part=None) -> 
torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + if self.repeat_for_decoder: + z_content = repeat(z_content, 'B F C -> B f F C', f=z.size(2)) + vt = rearrange(self.cont_emb(rearrange(z_content, 'B F A d -> (B F) A d')), '(B f) (H W) C -> B C f H W', H=z.size(3), W=z.size(4), f=z.size(2)) + else: + vt = rearrange(self.cont_emb(z_content), 'B F (C H W) -> B C F H W', H=z.size(3), W=z.size(4)) + vt = self.up_channel_temp(vt.transpose(1, -1)).transpose(1, -1) + if self.retain_num_frames: + vs = rearrange(self.spatial_emb(rearrange(z_motion, 'B F X Y -> (B F) X Y')), '(B F) (H W) C -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + else: + vs = rearrange(self.spatial_emb(z_motion), 'B (H W) (F C) -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + + if self.partial_content_motion == 'content': + c_plus_m = vt + elif self.partial_content_motion == 'motion': + c_plus_m = vs + else: + c_plus_m = vt + vs # shape (b, c', f, h', w') + if only_part == 'content': + c_plus_m = vt + elif only_part == 'motion': + c_plus_m = vs + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = self.pre_temporal_qformer(rearrange(self.down_channel_temp(rearrange(z_shuffled, 'B C F H W -> B F H W C')), 'B F H W C -> B F (H W C)')) + else: + pre_qformer = self.pre_temporal_qformer(rearrange(self.down_channel_temp(rearrange(z, 'B C F H W -> B F H W C')), 'B F H W C -> B F (H W C)')) + z_content = self.temporal_qformer(pre_qformer) # shape (b, f_q, d_q) + layer_norm_content = nn.LayerNorm(z_content.size(-1)).to(z_content.device) + z_content = layer_norm_content(z_content) + + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z, 'B C F H W -> B (H W) (F C)'))) + layer_norm_motion = nn.LayerNorm(z_motion.size(-1)).to(z_motion.device) + z_motion = layer_norm_motion(z_motion) + if return_reg_log: + return z, z_content, z_motion, None + return z, z_content, z_motion + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion + + +class VidAutoEncoderQformerCompactSym(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + space_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: 
float = 1.0, + compile_model: bool = False, + retain_num_frames: bool = True, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + expect_ch: int = 4, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.space_qformer = instantiate_from_config(space_qformer_config) + + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.retain_num_frames = retain_num_frames + if not self.retain_num_frames: + self.pre_spatial_qformer = nn.Sequential( + nn.Linear(self.hidden_dim * self.patch_nums[0], 2 * self.hidden_dim), + nn.ReLU(), + nn.Linear(2 * self.hidden_dim, self.hidden_dim), + nn.ReLU(), + ) + + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + if retain_num_frames: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, 
self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim * self.patch_nums[0]), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + + + downsample_blocks = [] + in_channels = temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, expect_ch, kernel_size=3, stride=1, padding=1) + self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.space_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.spatial_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + ) + if not self.retain_num_frames: + params += list(self.pre_spatial_qformer.parameters()) + + return params + + def decode(self, z, z_content, z_motion) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + if self.retain_num_frames: + vs = rearrange(self.spatial_emb(rearrange(z_motion, 'B F X Y -> (B F) X Y')), '(B F) (H W) C -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + else: + vs = rearrange(self.spatial_emb(z_motion), 'B (H W) (F C) -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + + if self.partial_content_motion == 'content': + c_plus_m = vt + elif self.partial_content_motion == 'motion': + c_plus_m = vs + else: + c_plus_m = vt + vs # shape (b, c', 
f, h', w') + + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z, 'B C F H W -> B (H W) (F C)'))) + if return_reg_log: + # return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion, None + return z, z_content, z_motion + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion + + +class VidAutoEncoderQformerCompactSymDis(VidAutoEncoderQformerCompactSym): + + def __init__( + self, + *args, + shuffle_content_ratio: float = 0.5, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.shuffle_content_ratio = shuffle_content_ratio + + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + # shuffle the content frames + x_shuffled = x.clone() + for i in range(x.size(0)): + randn_num = torch.rand(1) + if randn_num < self.shuffle_content_ratio: + idx = torch.randperm(x.size(2)) + x_shuffled[i] = x[i, :, idx, :, :] + x = torch.cat([x, x_shuffled], dim=0) + z = self.encoder(x) # shape (2b, c', f, h', w') + z_orig, z_shuffled = z.chunk(2, dim=0) + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z_orig, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z_orig, 'B C F H W -> B (H W) (F C)'))) + if return_reg_log: + # return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion, None + return z, z_content, z_motion + +class 
VidAutoEncoderQformerCompactSymVid(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + motion_num_blocks: int = 2, + expect_ch: int = 4, + d_dim: int = 16, + # space_qformer_config: Dict, + downsample_motion: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.d_dim = d_dim + + + downsample_blocks = [] + in_channels = 
temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, expect_ch, kernel_size=3, stride=1, padding=1) + self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + self.motion_emb = nn.Sequential( + nn.Linear(self.d_dim, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim), + nn.ReLU() + ) + self.motion_head = nn.Conv2d(self.hidden_dim, self.d_dim, kernel_size=3, stride=1, padding=1) + + self.downsample_motion = downsample_motion + if self.downsample_motion: + motion_downsample_blocks = [] + curr_resol = self.patch_nums[1] + for i in range(motion_num_blocks): + motion_downsample_blocks.append(nn.Conv2d(self.hidden_dim, self.hidden_dim, kernel_size=3, stride=2, padding=1)) + motion_downsample_blocks.append(nn.ReLU()) + curr_resol = (curr_resol + 1) // 2 + self.downsample_motion_module = nn.Sequential(*motion_downsample_blocks) + self.up_motion = nn.Sequential(nn.Linear(curr_resol, self.patch_nums[1]), + nn.ReLU(), + nn.Linear(self.patch_nums[1], self.patch_nums[1]), + nn.ReLU()) + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + list(self.motion_emb.parameters()) + + list(self.motion_head.parameters()) + + ) + if self.downsample_motion: + params += list(self.downsample_motion_module.parameters()) + params += list(self.up_motion.parameters()) + + return params + + def decode(self, z, z_content, z_motion_x, z_motion_y) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = 
rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + vx = rearrange(self.motion_emb(rearrange(z_motion_x, 'B D F W -> B F W D')), 'B F W C -> B C F W') # shape (b, c', f, w') + vy = rearrange(self.motion_emb(rearrange(z_motion_y, 'B D F H -> B F H D')), 'B F H C -> B C F H') # shape (b, c', f, h') + if self.downsample_motion: + vx = self.up_motion(vx) + vy = self.up_motion(vy) + vx = repeat(vx, 'b c f w -> b c f h w', h=z.size(3)) + vy = repeat(vy, 'b c f h -> b c f h w', w=z.size(4)) + + c_plus_m = vt + vx + vy # shape (b, c', f, h', w') + + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + z_motion_x, z_motion_y = self.get_motion_latent(z) + + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, None + return z, z_content, z_motion_x, z_motion_y + + def get_motion_latent(self, z: torch.Tensor) -> torch.Tensor: + f = z.size(2) + if self.downsample_motion: + z = self.downsample_motion_module(rearrange(z, 'B C F H W -> (B F) C H W')) + z = rearrange(z, '(B F) C H W -> B C F H W', F=f) + ux = torch.mean(z, dim=-2) # shape (b, c', f, w') + uy = torch.mean(z, dim=-1) # shape (b, c', f, h') + zx = self.motion_head(ux) # shape (b, d, f, w') + zy = self.motion_head(uy) # shape (b, d, f, h') + return zx, zy + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class VidAutoEncoderQformerCompactSymVidVAE(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + motion_num_blocks: int = 2, + expect_ch: int = 4, + d_dim: int = 16, + downsample_motion: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = 
kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.d_dim = d_dim + + + downsample_blocks = [] + in_channels = temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, 2*expect_ch, kernel_size=3, stride=1, padding=1) + 
self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + self.motion_emb = nn.Sequential( + nn.Linear(self.d_dim, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim), + nn.ReLU() + ) + self.motion_head = nn.Conv2d(self.hidden_dim, 2*self.d_dim, kernel_size=3, stride=1, padding=1) + + self.downsample_motion = downsample_motion + if self.downsample_motion: + motion_downsample_blocks = [] + curr_resol = self.patch_nums[1] + for i in range(motion_num_blocks): + motion_downsample_blocks.append(nn.Conv2d(self.hidden_dim, self.hidden_dim, kernel_size=3, stride=2, padding=1)) + motion_downsample_blocks.append(nn.ReLU()) + curr_resol = (curr_resol + 1) // 2 + self.downsample_motion_module = nn.Sequential(*motion_downsample_blocks) + self.up_motion = nn.Sequential(nn.Linear(curr_resol, self.patch_nums[1]), + nn.ReLU(), + nn.Linear(self.patch_nums[1], self.patch_nums[1]), + nn.ReLU()) + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + list(self.motion_emb.parameters()) + + list(self.motion_head.parameters()) + + ) + if self.downsample_motion: + params += list(self.downsample_motion_module.parameters()) + params += list(self.up_motion.parameters()) + + return params + + + def decode(self, z, z_content, z_motion_x, z_motion_y, only_part=None) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + vx = rearrange(self.motion_emb(rearrange(z_motion_x, 'B D F W -> B F W D')), 'B F W C -> B C F W') # shape (b, c', f, w') + vy = rearrange(self.motion_emb(rearrange(z_motion_y, 'B D F H -> B F H D')), 'B F H C -> B C F H') # shape (b, c', f, h') + if self.downsample_motion: + vx = self.up_motion(vx) + vy = self.up_motion(vy) + vx = repeat(vx, 'b c f w -> b c f h w', h=z.size(3)) + vy = repeat(vy, 'b c f h -> b c f h w', w=z.size(4)) + + if only_part == 'content': + c_plus_m = vt + elif only_part == 'motion': + c_plus_m = vx + vy + else: + c_plus_m = vt + vx + vy + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): 
+ idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B C F H W', F=z_content.size(1)) + z_content, content_reglog = self.regularization(z_content) + z_content = rearrange(z_content, 'B C F H W -> B F H W C') + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + z_motion_x, z_motion_y = self.get_motion_latent(z) + z_motion_x, z_motion_x_log = self.regularization(z_motion_x) + z_motion_y, z_motion_y_log = self.regularization(z_motion_y) + reg_log = {} + reg_log['kl_loss'] = content_reglog['kl_loss'] + z_motion_x_log['kl_loss'] + z_motion_y_log['kl_loss'] + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion_x, z_motion_y + + def get_motion_latent(self, z: torch.Tensor) -> torch.Tensor: + f = z.size(2) + if self.downsample_motion: + z = self.downsample_motion_module(rearrange(z, 'B C F H W -> (B F) C H W')) + z = rearrange(z, '(B F) C H W -> B C F H W', F=f) + ux = torch.mean(z, dim=-2) # shape (b, c', f, w') + uy = torch.mean(z, dim=-1) # shape (b, c', f, h') + zx = self.motion_head(ux) # shape (b, d, f, w') + zy = self.motion_head(uy) # shape (b, d, f, h') + + return zx, zy + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class DepthToSpace(nn.Module): + def __init__(self, block_size): + super().__init__() + self.bs, self.bt = block_size + + def forward(self, x): + B, C, N, H, W = x.size() + x = x.view(B, self.bt, self.bs, self.bs, C // ((self.bs ** 2) * self.bt), N, H, W) # (B, bs, bs, bs, C//bs^2, N, H, W) + x = x.permute(0, 4, 5, 1, 6, 2, 7, 3).contiguous() # (B, C//bs^3, N, bs, H, bs, W, bs) + x = x.view(B, C // ((self.bs ** 2) * self.bt), N * self.bt, H * self.bs, W * self.bs) # (B, C//bs^3, N * bs, H * bs, W * bs) + # remove the first frame + if self.bt > 1: + x = x[:, :, 1:, :, :] + else: + x = x + return x + + +from torch.optim.lr_scheduler import _LRScheduler + + +class LinearWarmupScheduler(_LRScheduler): + def __init__(self, optimizer, warmup_steps, total_steps, target_lr, last_epoch=-1): + self.warmup_steps = warmup_steps + self.target_lr = target_lr + self.total_steps = total_steps + super(LinearWarmupScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + # Linear warm-up + return [base_lr * (self.last_epoch / self.warmup_steps) for base_lr in self.base_lrs] + elif self.last_epoch < self.total_steps: + # Constant learning rate + return [base_lr * (1 - self.last_epoch / self.total_steps) for base_lr in self.base_lrs] + else: + return self.base_lrs + +class ConstantWarmupScheduler(_LRScheduler): + def __init__(self, optimizer, warmup_steps, total_steps, last_epoch=-1): + self.warmup_steps = warmup_steps + self.total_steps = total_steps + # self.base_lrs = 
lr_max + super(ConstantWarmupScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + # Linear warm-up + return [base_lr * (self.last_epoch / self.warmup_steps) for base_lr in self.base_lrs] + elif self.last_epoch < self.total_steps: + # Constant learning rate + return self.base_lrs + +class LambdaWarmUpCosineScheduler(_LRScheduler): + """ + note: use with a base_lr of 1.0 + """ + def __init__(self, optimizer, lr_min, lr_max, lr_start, total_steps, warmup_rate = -1, verbosity_interval=0, last_epoch=-1, warmup_steps=-1): + self.verbosity_interval = verbosity_interval + if warmup_rate >= 0: + self.lr_warm_up_steps = total_steps * warmup_rate + elif warmup_steps >= 0: + self.lr_warm_up_steps = warmup_steps + else: + self.lr_warm_up_steps = 0 + self.lr_start = lr_start + self.lr_min = lr_min + self.lr_max = lr_max + self.lr_max_decay_steps = total_steps + super(LambdaWarmUpCosineScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.verbosity_interval > 0: + if self.last_epoch % self.verbosity_interval == 0: print(f"current step: {self.last_epoch}, recent lr-multiplier: {self.last_lr}") + if self.last_epoch < self.lr_warm_up_steps: + lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * self.last_epoch + self.lr_start + self.last_lr = lr + return [lr] + else: + t = (self.last_epoch - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) + t = min(t, 1.0) + lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( + 1 + np.cos(t * np.pi)) # a + 0.5 * (b - a) * (1 + cos(pi * t)), where t \in [0, 1], so the lr will be in [a, b] + self.last_lr = lr + return [lr] + + + +def instantiate_lrscheduler_from_config(optimizer, config, name='main-LR'): + """ + Instantiate a learning rate scheduler from a config dict. + If use timm, must add the following codes to the LightningModule: + + def lr_scheduler_step(self, scheduler, metric): + if 'timm.scheduler' in self.lr_scheduler_config.target: + scheduler.step(epoch=self.current_epoch) + else: + if metric is None: + scheduler.step() + else: + scheduler.step(metric) + """ + assert 'target' in config, 'Expected key `target` to instantiate.' 
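# Usage sketch (added for illustration; all values below are made up): the
# LambdaWarmUpCosineScheduler defined above returns absolute learning rates,
# which is why its docstring asks for a base_lr of 1.0 -- linear warmup from
# lr_start to lr_max, then cosine decay from lr_max down to lr_min.
import torch

_params = [torch.nn.Parameter(torch.zeros(1))]
_opt = torch.optim.Adam(_params, lr=1.0)  # base_lr of 1.0, per the docstring
_sched = LambdaWarmUpCosineScheduler(
    _opt, lr_min=1e-6, lr_max=1e-4, lr_start=0.0, total_steps=1000, warmup_steps=100
)
for _ in range(5):
    _opt.step()
    _sched.step()
# After 5 of the 100 warmup steps the lr is ~5% of lr_max, i.e. about 5e-6.
print(_opt.param_groups[0]["lr"])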
+ if ('torch.optim' in config.target) or ('timm.scheduler' in config.target): + scheduler = get_obj_from_str(config["target"])(optimizer, **config.get("params", dict())) + lr_scheduler = { + 'scheduler': scheduler, + 'name': name + } + else: + scheduler_init = instantiate_from_config(config) + scheduler = LambdaLR(optimizer, lr_lambda=scheduler_init.schedule) + lr_scheduler = { + 'scheduler': LambdaLR(optimizer, lr_lambda=scheduler_init.schedule), + 'name': name, + 'interval': 'step', + 'frequency': 1 + } + return scheduler + + + diff --git a/Meissonic/VidTok/vidtwin/modules/qformer.py b/Meissonic/VidTok/vidtwin/modules/qformer.py new file mode 100644 index 0000000000000000000000000000000000000000..be08e95b15c2fa1b82a0d761547734834e6bf139 --- /dev/null +++ b/Meissonic/VidTok/vidtwin/modules/qformer.py @@ -0,0 +1,654 @@ +# coding=utf-8 +"""PyTorch BLIP-2 model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from transformers import Blip2QFormerConfig, Blip2PreTrainedModel + + +logger = logging.get_logger(__name__) + +class Blip2QFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + 
(self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
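# Illustrative shape walk-through (added; toy sizes, not from the original
# code): at its core the block above is scaled dot-product attention, where
# the Q-Former query tokens attend over the encoder hidden states when the
# module is used as cross-attention.
import math
import torch

B, H, n_q, S, d_h = 2, 8, 16, 196, 64               # batch, heads, query tokens, encoder tokens, head dim
q = torch.randn(B, H, n_q, d_h)
k = torch.randn(B, H, S, d_h)
v = torch.randn(B, H, S, d_h)
scores = q @ k.transpose(-1, -2) / math.sqrt(d_h)    # (B, H, n_q, S)
probs = scores.softmax(dim=-1)                        # each query token's weights sum to 1
context = probs @ v                                   # (B, H, n_q, d_h)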
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class Blip2QFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention) + self.output = Blip2QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class Blip2QFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class Blip2QFormerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = 
self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Blip2QFormerModel(Blip2PreTrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. + """ + + def __init__(self, config: Blip2QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + query_embeds: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
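        Example (added for illustration; the shapes and config values below are
        assumptions, not taken from the original code):

            >>> import torch
            >>> config = Blip2QFormerConfig(hidden_size=64, encoder_hidden_size=768,
            ...                             num_hidden_layers=2, num_attention_heads=8)
            >>> qformer = Blip2QFormerModel(config)
            >>> queries = torch.randn(2, 8, 64)    # (batch, num_query_tokens, hidden_size)
            >>> enc = torch.randn(2, 196, 768)     # frozen encoder / backbone tokens
            >>> out = qformer(query_embeds=queries, encoder_hidden_states=enc)
            >>> out.last_hidden_state.shape
            torch.Size([2, 8, 64])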
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +from einops import repeat + +class MyQformerInterface(nn.Module): + def __init__(self, num_query_tokens=3, query_hidden_size=64, encoder_hidden_size=768, num_hidden_layers=6,intermediate_size=768, num_attention_heads=8): + super().__init__() + self.config = Blip2QFormerConfig(hidden_size=query_hidden_size, encoder_hidden_size=encoder_hidden_size, num_hidden_layers=num_hidden_layers, intermediate_size=intermediate_size, num_attention_heads=num_attention_heads) + self.qformer = Blip2QFormerModel(self.config) + self.query_embeds = nn.Parameter(torch.randn(num_query_tokens, query_hidden_size)) + + def forward(self, encoder_hidden_states): + query_batch = repeat(self.query_embeds, 'q d -> b q d', 
b=encoder_hidden_states.shape[0]) + output = self.qformer(query_embeds=query_batch, encoder_hidden_states=encoder_hidden_states) + return output.last_hidden_state + + +if __name__ == '__main__': + a_former = MyQformerInterface(10, 768, 768) + print('initialized query embeddings', a_former.query_embeds) + test_encoder_hidden_states = torch.randn(2, 16, 768) * 100 + + for name, param in a_former.named_parameters(): + print(name, param.shape) + optim = torch.optim.Adam(a_former.parameters(), lr=0.01) + for i in range(20): + print('running forward pass', i) + output = a_former(test_encoder_hidden_states) + print('loss', output.sum()) + output.sum().backward() + optim.step() + optim.zero_grad() + + print('query embeddings after 10 forward passes', a_former.query_embeds) + diff --git a/Meissonic/VidTok/vidtwin/modules/st_transformer.py b/Meissonic/VidTok/vidtwin/modules/st_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ae93682b034377a6fa7c36a55f6e714462bab28b --- /dev/null +++ b/Meissonic/VidTok/vidtwin/modules/st_transformer.py @@ -0,0 +1,804 @@ +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +from timm.models.layers import DropPath +from timm.models.vision_transformer import Mlp +import torch.nn.functional as F +approx_gelu = lambda: nn.GELU(approximate="tanh") + +from collections.abc import Iterable + +from torch.utils.checkpoint import checkpoint, checkpoint_sequential +from pathlib import Path +from omegaconf import ListConfig +from torch.cuda.amp import autocast + +from einops import rearrange, repeat, reduce, pack, unpack +import pickle + +def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1): + assert isinstance(model, nn.Module) + + def set_attr(module): + module.grad_checkpointing = True + module.fp32_attention = use_fp32_attention + module.grad_checkpointing_step = gc_step + + model.apply(set_attr) + + +def auto_grad_checkpoint(module, *args, **kwargs): + if getattr(module, "grad_checkpointing", False): + if not isinstance(module, Iterable): + return checkpoint(module, *args, **kwargs) + gc_step = module[0].grad_checkpointing_step + return checkpoint_sequential(module, gc_step, *args, **kwargs) + return module(*args, **kwargs) + + +def get_layernorm(hidden_size: torch.Tensor, eps: float, affine: bool, use_kernel: bool): + if use_kernel: + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(hidden_size, elementwise_affine=affine, eps=eps) + except ImportError: + raise RuntimeError("FusedLayerNorm not available. Please install apex.") + else: + return nn.LayerNorm(hidden_size, eps, elementwise_affine=affine) + + +def t2i_modulate(x, shift, scale): + return x * (1 + scale) + shift + + +class T2IFinalLayer(nn.Module): + """ + The final layer of PixArt. 
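    Example (illustrative; sizes below are assumptions): the layer applies the
    t2i_modulate shift/scale defined above to the normalized tokens and then
    projects each token to num_patch * out_channels values.

        >>> import torch
        >>> layer = T2IFinalLayer(hidden_size=768, num_patch=4, out_channels=16)
        >>> tokens = torch.randn(2, 196, 768)      # (batch, tokens, hidden)
        >>> layer(tokens).shape
        torch.Size([2, 196, 64])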
+ """ + + def __init__(self, hidden_size, num_patch, out_channels): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True) + self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5) + self.out_channels = out_channels + + def forward(self, x): + shift, scale = (self.scale_shift_table[None]).chunk(2, dim=1) + x = t2i_modulate(self.norm_final(x), shift, scale) + x = self.linear(x) + return x + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + enable_flashattn: bool = False, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + self.enable_flashattn = enable_flashattn + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor, causal: bool) -> torch.Tensor: + B, N, C = x.shape + qkv = self.qkv(x) + qkv_shape = (B, N, 3, self.num_heads, self.head_dim) + if self.enable_flashattn: + qkv_permute_shape = (2, 0, 1, 3, 4) + else: + qkv_permute_shape = (2, 0, 3, 1, 4) + qkv = qkv.view(qkv_shape).permute(qkv_permute_shape) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + if self.enable_flashattn: + from flash_attn import flash_attn_func + + x = flash_attn_func( + q, + k, + v, + dropout_p=self.attn_drop.p if self.training else 0.0, + softmax_scale=self.scale, + causal=causal, + ) + else: + # raise NotImplementedError + dtype = q.dtype + q = q * self.scale + attn = q @ k.transpose(-2, -1) # translate attn to float32 + attn = attn.to(torch.float32) + attn = attn.softmax(dim=-1) + attn = attn.to(dtype) # cast back attn to original dtype + attn = self.attn_drop(attn) + x = attn @ v + + x_output_shape = (B, N, C) + if not self.enable_flashattn: + x = x.transpose(1, 2) + x = x.reshape(x_output_shape) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class GroupAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + enable_flashattn: bool = False, + group_size: int = 4, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + self.enable_flashattn = enable_flashattn + self.group_size = group_size + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor, causal: bool) -> torch.Tensor: + B, N, C = x.shape + assert N % self.group_size == 0, "sequence length should be divisible by 
group_size" + G = N // self.group_size + if self.enable_flashattn: + qkv_permute_shape = (2, 0, 1, 3, 4) + else: + qkv_permute_shape = (2, 0, 3, 1, 4) + qkv = self.qkv(x).view(B, N, 3, self.num_heads, self.head_dim).permute(qkv_permute_shape) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + + + if self.enable_flashattn: + # reshape to (B, G, 4, H, D) + q = q.view(B * G, self.group_size, self.num_heads, self.head_dim) + k = k.view(B * G, self.group_size, self.num_heads, self.head_dim) + v = v.view(B * G, self.group_size, self.num_heads, self.head_dim) + from flash_attn import flash_attn_func + + # modify flash_attn_func to support the new shape + x = flash_attn_func( + q, + k, + v, + dropout_p=self.attn_drop.p if self.training else 0.0, + softmax_scale=self.scale, + causal=causal, + ).reshape(B, N, C) + else: + q = rearrange(q, "B H S D -> (B G) H N D", G=G) + k = rearrange(k, "B H S D -> (B G) H N D", G=G) + v = rearrange(v, "B H S D -> (B G) H N D", G=G) + q = q * self.scale + attn = (q @ k.transpose(-2, -1)).softmax(dim=-1) + attn = self.attn_drop(attn) + x = (attn @ v) + x = rearrange(x, "(B G) H N D -> B S (H D)", G=G, S=N) + + x = self.proj(x) + x = self.proj_drop(x) + return x + +class PatchEmbed3D(nn.Module): + """Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__( + self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None, + flatten=True, + ): + super().__init__() + self.patch_size = patch_size + self.flatten = flatten + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # (B 768, 16, 14, 14) patchify, for each patch, we use 768 vector to represent it + if self.norm is not None: + D, Wh, Ww = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCTHW -> BNC + return x + + + +class STBlock(nn.Module): + def __init__( + self, + hidden_size, + num_heads, + d_s=None, + d_t=None, + mlp_ratio=4.0, + drop_path=0.0, + enable_flashattn=True, + enable_layernorm_kernel=False, + temporal_casual=True, + no_temporal=False, + temporal_group = False, + group_size = 1 + # enable_sequence_parallelism=False, + ): + super().__init__() + self.hidden_size = hidden_size + self.enable_flashattn = enable_flashattn + + self.attn_cls = Attention + self.no_temporal = no_temporal + self.attn_group = GroupAttention + self.temporal_group = temporal_group + self.group_size = group_size + + self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) + self.attn = self.attn_cls( + hidden_size, + num_heads=num_heads, + 
qkv_bias=True, + enable_flashattn=enable_flashattn, + ) + self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) + self.mlp = Mlp( + in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0 + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5) + + # temporal attention + self.d_s = d_s + self.d_t = d_t + if self.temporal_group: + self.attn_temp = self.attn_group( + hidden_size, + num_heads=num_heads, + qkv_bias=True, + enable_flashattn=self.enable_flashattn, + group_size=self.group_size, + ) + else: + self.attn_temp = self.attn_cls( + hidden_size, + num_heads=num_heads, + qkv_bias=True, + enable_flashattn=self.enable_flashattn, + ) + self.temporal_casual = temporal_casual + + def forward(self, x, tpe=None): + + # B, T, S, C = x.shape[0] + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + ).chunk(6, dim=1) + x = x.to(torch.float64) + x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa).to(torch.float64) + + # spatial branch + x_s = rearrange(x_m, "B T S C -> (B T) S C", T=self.d_t, S=self.d_s) + # print(x_s.dtype) + # x_s = x_s.to(torch.float32) + x_s = x_s.to(torch.bfloat16) + x_s = self.attn(x_s, causal=False,).to(torch.bfloat16) + x_s = rearrange(x_s, "(B T) S C -> B T S C", T=self.d_t, S=self.d_s) + x = x + self.drop_path(gate_msa * x_s) + + if not self.no_temporal: + # temporal branch + x_t = rearrange(x, "B T S C -> (B S) T C", T=self.d_t, S=self.d_s) + + if tpe is not None: + x_t = x_t + tpe + x_t = x_t.to(torch.bfloat16) + x_t = self.attn_temp(x_t, causal=self.temporal_casual,) + x_t = rearrange(x_t, "(B S) T C -> B T S C", T=self.d_t, S=self.d_s).to(torch.bfloat16) + x = x + self.drop_path(gate_msa * x_t) + + # mlp + x = x.to(torch.float32) + x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) + x = x.to(torch.float32) + + return x + + +def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0): + pos = np.arange(0, length)[..., None] / scale + return get_1d_sincos_pos_embed_from_grid(embed_dim, pos) + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if not isinstance(grid_size, tuple): + grid_size = (grid_size, grid_size) + + grid_h = np.arange(grid_size[0], dtype=np.float32) / scale + grid_w = np.arange(grid_size[1], dtype=np.float32) / scale + if base_size is not None: + grid_h *= base_size / grid_size[0] + grid_w *= base_size / grid_size[1] + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size[1], 
grid_size[0]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def exists(v): + return v is not None + +def default(v, d): + return v if exists(v) else d + +def divisible_by(num, den): + return (num % den) == 0 + +def is_odd(n): + return not divisible_by(n, 2) + +def cast_tuple(t, length = 1): + if isinstance(t, ListConfig): + return tuple(t) + return t if isinstance(t, tuple) else ((t,) * length) + +class DepthToSpace(nn.Module): + + def __init__(self, block_size): + super().__init__() + self.bs, self.bt = block_size + + def forward(self, x): + B, C, N, H, W = x.size() + x = x.view(B, self.bt, self.bs, self.bs, C // ((self.bs ** 2) * self.bt), N, H, W) # (B, bs, bs, bs, C//bs^2, N, H, W) + x = x.permute(0, 4, 5, 1, 6, 2, 7, 3).contiguous() # (B, C//bs^3, N, bs, H, bs, W, bs) + x = x.view(B, C // ((self.bs ** 2) * self.bt), N * self.bt, H * self.bs, W * self.bs) # (B, C//bs^3, N * bs, H * bs, W * bs) + # remove the first frame + if self.bt > 1: + x = x[:, :, 1:, :, :] + else: + x = x + return x + + +# Swish Function +class Swish(nn.Module): + def forward(self, x): + return x * torch.sigmoid(x) + + + +class STTransformer(nn.Module): + def __init__( + self, + input_size=(1, 32, 32), + in_channels=4, + patch_size=(1, 2, 2), + hidden_size=1152, + depth=28, + num_heads=16, + mlp_ratio=4.0, + pred_sigma=False, + drop_path=0.0, + no_temporal_pos_emb=False, + space_scale=1.0, + time_scale=1.0, + freeze=None, + enable_flashattn=False, + enable_layernorm_kernel=False, + temporal_casual=True, + no_temporal=False, + temporal_group=False, + group_size=1, + ): + super().__init__() + self.pred_sigma = pred_sigma + self.in_channels = in_channels + self.out_channels = in_channels * 2 if pred_sigma else in_channels + self.hidden_size = hidden_size + self.patch_size = patch_size + self.input_size = input_size + num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)]) + self.num_patches = num_patches + self.num_temporal = input_size[0] // patch_size[0] + self.num_spatial = num_patches // self.num_temporal + self.num_heads = num_heads + self.no_temporal_pos_emb = no_temporal_pos_emb + self.depth = depth + self.mlp_ratio = mlp_ratio + self.enable_flashattn = enable_flashattn + self.enable_layernorm_kernel = enable_layernorm_kernel + self.space_scale = space_scale + self.time_scale = time_scale + self.temporal_casual = temporal_casual + self.temporal_group = temporal_group + self.group_size = group_size + + self.register_buffer("pos_embed", self.get_spatial_pos_embed()) + self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed()) + + self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size) + self.no_temporal = no_temporal + + drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] + self.blocks = nn.ModuleList( + [ + STBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=self.mlp_ratio, + drop_path=drop_path[i], + enable_flashattn=self.enable_flashattn, + enable_layernorm_kernel=self.enable_layernorm_kernel, + 
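# Worked example (not part of the committed diff) for the sinusoidal embeddings
# above: get_1d_sincos_pos_embed_from_grid returns a (length, embed_dim) array
# whose first embed_dim/2 features are sin(pos * omega) and the rest are
# cos(pos * omega), with omega_i = 1 / 10000 ** (2 i / embed_dim).
import numpy as np

embed_dim, length = 8, 4
pos = np.arange(length, dtype=np.float64)
omega = 1.0 / 10000 ** (np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2))
out = np.einsum("m,d->md", pos, omega)
emb = np.concatenate([np.sin(out), np.cos(out)], axis=1)
assert emb.shape == (length, embed_dim)
assert np.allclose(emb[0], [0, 0, 0, 0, 1, 1, 1, 1])  # position 0: all sin = 0, all cos = 1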
d_t=self.num_temporal, + d_s=self.num_spatial, + temporal_casual=self.temporal_casual, + no_temporal=self.no_temporal, + temporal_group = self.temporal_group, + group_size = self.group_size + ) + for i in range(self.depth) + ] + ) + self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels) + + # init model + self.initialize_weights() + self.initialize_temporal() + if freeze is not None: + assert freeze in ["not_temporal", "text"] + if freeze == "not_temporal": + self.freeze_not_temporal() + elif freeze == "text": + self.freeze_text() + + + + def forward(self, x): + """ + Forward pass of STDiT. + Args: + x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W] + + Returns: + x (torch.Tensor): output latent representation; of shape [B, C, T, H, W] + """ + + x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial) + x = x + self.pos_embed + + with autocast(enabled=True): + for i, block in enumerate(self.blocks): + if i == 0: + tpe = self.pos_embed_temporal + else: + tpe = None + x = auto_grad_checkpoint(block, x, tpe) + + x = rearrange(x, "B T S C -> B (T S) C", T=self.num_temporal, S=self.num_spatial) + return x + + def unpatchify(self, x): + """ + Args: + x (torch.Tensor): of shape [B, N, C] + + Return: + x (torch.Tensor): of shape [B, C_out, T, H, W] + """ + + N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)] + T_p, H_p, W_p = self.patch_size + x = rearrange( + x, + "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)", + N_t=N_t, + N_h=N_h, + N_w=N_w, + T_p=T_p, + H_p=H_p, + W_p=W_p, + C_out=self.out_channels, + ) + return x + + def unpatchify_old(self, x): + c = self.out_channels + t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)] + pt, ph, pw = self.patch_size + + x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c)) + x = rearrange(x, "n t h w r p q c -> n c t r h p w q") + imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw)) + return imgs + + def get_spatial_pos_embed(self, grid_size=None): + if grid_size is None: + grid_size = self.input_size[1:] + pos_embed = get_2d_sincos_pos_embed( + self.hidden_size, + (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]), + scale=self.space_scale, + ) + pos_embed = torch.from_numpy(pos_embed).unsqueeze(0).requires_grad_(False) + return pos_embed + + def get_temporal_pos_embed(self): + pos_embed = get_1d_sincos_pos_embed( + self.hidden_size, + self.input_size[0] // self.patch_size[0], + scale=self.time_scale, + ) + pos_embed = torch.from_numpy(pos_embed).unsqueeze(0).requires_grad_(False) + return pos_embed + + def freeze_not_temporal(self): + for n, p in self.named_parameters(): + if "attn_temp" not in n: + p.requires_grad = False + + def freeze_text(self): + for n, p in self.named_parameters(): + if "cross_attn" in n: + p.requires_grad = False + + def initialize_temporal(self): + for block in self.blocks: + nn.init.constant_(block.attn_temp.proj.weight, 0) + nn.init.constant_(block.attn_temp.proj.bias, 0) + + def initialize_weights(self): + # Initialize transformer layers: + def _basic_init(module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + self.apply(_basic_init) + + w = self.x_embedder.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + nn.init.constant_(self.final_layer.linear.weight, 0) + 
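# Token bookkeeping sketch (not part of the committed diff): with the default
# input_size=(1, 32, 32) and patch_size=(1, 2, 2), PatchEmbed3D yields
# num_temporal * num_spatial tokens, and STTransformer.forward() reshapes the
# flat token axis "B (T S) C" back into (T, S) before the spatial and temporal
# attention branches.
import numpy as np

input_size, patch_size = (1, 32, 32), (1, 2, 2)
num_patches = int(np.prod([input_size[i] // patch_size[i] for i in range(3)]))
num_temporal = input_size[0] // patch_size[0]   # 1 temporal position
num_spatial = num_patches // num_temporal       # 16 * 16 = 256 spatial positions
assert (num_temporal, num_spatial, num_patches) == (1, 256, 256)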
nn.init.constant_(self.final_layer.linear.bias, 0) + +class STTEncoder(STTransformer): + def __init__(self, input_size=(1, 32, 32), in_channels=3, patch_size=(1, 2, 2), hidden_size=64, depth=12, num_heads=8, mlp_ratio=4, pred_sigma=False, drop_path=0, no_temporal_pos_emb=False, space_scale=1, time_scale=1, freeze=None, enable_flashattn=True, enable_layernorm_kernel=False, temporal_casual=True, no_temporal=False, temporal_group=False, group_size=1): + super().__init__(input_size, in_channels, patch_size, hidden_size, depth, num_heads, mlp_ratio, pred_sigma, drop_path, no_temporal_pos_emb, space_scale, time_scale, freeze, enable_flashattn, enable_layernorm_kernel, temporal_casual, no_temporal, temporal_group, group_size) + + def forward(self, x): + x = self.x_embedder(x) + y = super().forward(x) + y = rearrange(y, "B (T H W) C -> B C T H W", T=self.input_size[0], H=self.input_size[1]//self.patch_size[1], W=self.input_size[2]//self.patch_size[2]) + return y + + @property + def device(self): + return self.zero.device + + @classmethod + def init_and_load_from(cls, path, strict = True): + path = Path(path) + assert path.exists() + pkg = torch.load(str(path), map_location = 'cpu') + + assert 'config' in pkg, 'model configs were not found in this saved checkpoint' + + config = pickle.loads(pkg['config']) + tokenizer = cls(**config) + tokenizer.load(path, strict = strict) + return tokenizer + + def save(self, path, overwrite = True): + path = Path(path) + assert overwrite or not path.exists(), f'{str(path)} already exists' + + pkg = dict( + model_state_dict = self.state_dict(), + version =self.__version__, + config = self._configs + ) + + torch.save(pkg, str(path)) + + def load(self, path, strict = True): + path = Path(path) + assert path.exists() + + pkg = torch.load(str(path)) + state_dict = pkg.get('model_state_dict') + version = pkg.get('version') + + assert exists(state_dict) + + if exists(version): + print(f'loading checkpointed tokenizer from version {version}') + + self.load_state_dict(state_dict, strict = strict) + + + @torch.no_grad() + def tokenize(self, video): + self.eval() + return self.forward(video, return_codes = True) + + def debug_model(self, x, layer): + if torch.isnan(x).any(): + print('x has nan') + print(layer) + import sys + sys.exit() + + + +class STTDecoder(STTransformer): + def __init__(self, input_size=(1, 32, 32), in_channels=3, patch_size=(1, 2, 2), hidden_size=1152, depth=12, num_heads=16, mlp_ratio=4, pred_sigma=False, drop_path=0, no_temporal_pos_emb=False, space_scale=1, time_scale=1, freeze=None, enable_flashattn=True, enable_layernorm_kernel=False, temporal_casual=True, no_temporal=False): + super().__init__(input_size, in_channels, patch_size, hidden_size, depth, num_heads, mlp_ratio,pred_sigma, drop_path, no_temporal_pos_emb, space_scale, time_scale, freeze, enable_flashattn, enable_layernorm_kernel, temporal_casual, no_temporal) + self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels) + + def forward(self, x): + x = rearrange(x, "B C T H W -> B (T H W) C") + y = super().forward(x) + y = self.final_layer(y) + y = self.unpatchify(y) + return y + + @property + def device(self): + return self.zero.device + + @classmethod + def init_and_load_from(cls, path, strict = True): + path = Path(path) + assert path.exists() + pkg = torch.load(str(path), map_location = 'cpu') + + assert 'config' in pkg, 'model configs were not found in this saved checkpoint' + + config = pickle.loads(pkg['config']) + tokenizer = cls(**config) + 
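# Checkpoint-format sketch (not part of the committed diff): save() and
# init_and_load_from() above exchange a plain dict holding 'model_state_dict',
# 'version', and a pickled kwargs dict under 'config'; note that self._configs
# and self.__version__ are assumed to be set elsewhere -- they are not defined
# in this file. A minimal round trip under those assumptions:
import pickle

import torch
import torch.nn as nn

class ToyTokenizer(nn.Module):
    def __init__(self, hidden=8):
        super().__init__()
        self.fc = nn.Linear(hidden, hidden)

model = ToyTokenizer(hidden=8)
pkg = dict(
    model_state_dict=model.state_dict(),
    version="0.1",
    config=pickle.dumps(dict(hidden=8)),   # what init_and_load_from unpickles
)
torch.save(pkg, "toy_tokenizer.ckpt")

pkg = torch.load("toy_tokenizer.ckpt", map_location="cpu", weights_only=False)  # config is a pickled blob
restored = ToyTokenizer(**pickle.loads(pkg["config"]))
restored.load_state_dict(pkg["model_state_dict"], strict=True)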
tokenizer.load(path, strict = strict) + return tokenizer + + def save(self, path, overwrite = True): + path = Path(path) + assert overwrite or not path.exists(), f'{str(path)} already exists' + + pkg = dict( + model_state_dict = self.state_dict(), + version = self.__version__, + config = self._configs + ) + + torch.save(pkg, str(path)) + + def load(self, path, strict = True): + path = Path(path) + assert path.exists() + + pkg = torch.load(str(path)) + state_dict = pkg.get('model_state_dict') + version = pkg.get('version') + + assert exists(state_dict) + + if exists(version): + print(f'loading checkpointed tokenizer from version {version}') + + self.load_state_dict(state_dict, strict = strict) + + + @torch.no_grad() + def tokenize(self, video): + self.eval() + return self.forward(video, return_codes = True) + + def debug_model(self, x, layer): + if torch.isnan(x).any(): + print('x has nan') + print(layer) + import sys + sys.exit() + + def get_last_layer(self): + return self.final_layer.linear.weight \ No newline at end of file diff --git a/Meissonic/VidTok/vidtwin/scripts/inference_evaluate.py b/Meissonic/VidTok/vidtwin/scripts/inference_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4a7b33dffb2a51c36294124a1ea7079ac70c97 --- /dev/null +++ b/Meissonic/VidTok/vidtwin/scripts/inference_evaluate.py @@ -0,0 +1,208 @@ +import argparse +import os +import sys +sys.path.append(os.getcwd()) + +import warnings +warnings.filterwarnings("ignore") + +import time +import numpy as np +import torch +from contextlib import nullcontext +from pathlib import Path + +import decord +from einops import rearrange +from lightning.pytorch import seed_everything +from omegaconf import OmegaConf +from safetensors.torch import load_file as load_safetensors +from torch import autocast +from torchvision import transforms +from tqdm import tqdm + +from vidtok.modules.lpips import LPIPS +from vidtok.data.vidtok import VidTokValDataset +from vidtok.modules.util import instantiate_from_config, print0, compute_psnr, compute_ssim + + +def load_model_from_config(config, ckpt, verbose=False): + config = OmegaConf.load(config) + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Loading model from {ckpt}") + model = instantiate_from_config(config.model) + + if ckpt.endswith("ckpt"): + sd = torch.load(ckpt, map_location="cpu")["state_dict"] + elif ckpt.endswith("safetensors"): + sd = load_safetensors(ckpt) + else: + raise NotImplementedError(f"Unknown checkpoint: {ckpt}") + + new_sd = {} + for k, v in sd.items(): + if k.startswith("loss"): + continue + new_sd[k] = v + missing, unexpected = model.load_state_dict(new_sd, strict=False) + print0( + f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Restored from {ckpt} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + + if len(missing) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Unexpected Keys: {unexpected}") + return model + + +class MultiVideoDataset(VidTokValDataset): + def __init__( + self, + data_dir, + meta_path=None, + input_height=256, + input_width=256, + num_frames_per_batch=17, + sample_fps=30, + ): + super().__init__( + data_dir=data_dir, + meta_path=meta_path, + video_params={ + "input_height": input_height, + "input_width": input_width, + "sample_num_frames": num_frames_per_batch, + "sample_fps": sample_fps, + }, + 
pre_load_frames=True, + last_frames_handle="repeat", + ) + + def __getitem__(self, idx): + frames = super().__getitem__(idx)["jpg"] + return frames + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--data_dir", + type=str, + default="", + help="root folder", + ) + parser.add_argument( + "--meta_path", + type=str, + default=None, + help="path to the .csv meta file", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=17, + help="number of frames per batch", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset = MultiVideoDataset( + data_dir=args.data_dir, + meta_path=args.meta_path, + input_height=args.input_height, + input_width=args.input_width, + num_frames_per_batch=args.num_frames_per_batch, + sample_fps=args.sample_fps + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + perceptual_loss = LPIPS().eval() + perceptual_loss = perceptual_loss.to(device) + + psnrs, ssims, lpipss = [], [], [] + + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, output, *_ = model(input) + + output = output.clamp(-1, 1) + input, output = map(lambda x: (x + 1) / 2, (input, output)) + + if input.dim() == 5: + input = rearrange(input, "b c t h w -> (b t) c h w") + assert output.dim() == 5 + output = rearrange(output, "b c t h w -> (b t) c h w") + + psnr = compute_psnr(input, output) + ssim = compute_ssim(input, output) + lpips = perceptual_loss(input * 2 - 1, output * 2 - 1).mean() + + psnrs.append(psnr.item()) + ssims.append(ssim.item()) + lpipss.append(lpips.item()) + + toc = time.time() + print0( + f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] PSNR: {np.mean(psnrs):.4f}, SSIM: {np.mean(ssims):.4f}, LPIPS: {np.mean(lpipss):.4f}" + ) + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/vidtwin/scripts/inference_reconstruct.py b/Meissonic/VidTok/vidtwin/scripts/inference_reconstruct.py new file mode 100644 index 
0000000000000000000000000000000000000000..e568df36bab66e6e0061aeaee05b2da000afc0ec --- /dev/null +++ b/Meissonic/VidTok/vidtwin/scripts/inference_reconstruct.py @@ -0,0 +1,191 @@ +import os +import sys +sys.path.append(os.getcwd()) + +import argparse +import warnings +warnings.filterwarnings("ignore") + +import time +import numpy as np +from contextlib import nullcontext +from pathlib import Path + +import torch +from einops import rearrange +from lightning.pytorch import seed_everything +from torch import autocast +from torchvision.io import write_video +from tqdm import tqdm + +from vidtwin.scripts.inference_evaluate import print0, load_model_from_config, transforms, decord + + +class SingleVideoDataset(torch.utils.data.Dataset): + def __init__(self, video_path, input_height=128, input_width=128, num_frames_per_batch=16, sample_fps=8): + decord.bridge.set_bridge("torch") + self.video_path = video_path + normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + self.transform = transforms.Compose( + [ + transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x : x + num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x : x + num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255.0 + frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=17, + help="number of 
frames per batch", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwininference_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + os.makedirs(args.output_video_dir, exist_ok=True) + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset = SingleVideoDataset( + args.input_video_path, args.input_height, args.input_width, args.num_frames_per_batch, args.sample_fps + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + inputs = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, xrec, *_ = model(input) + input = rearrange(input, "b c t h w -> (b t) c h w") + inputs.append(input) + xrec = rearrange(xrec, "b c t h w -> (b t) c h w") + outputs.append(xrec) + + toc = time.time() + + # save the outputs as videos + inputs = tensor_to_uint8(torch.cat(inputs, dim=0)) + inputs = rearrange(inputs, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs.shape[0], outputs.shape[0]) + final = np.concatenate([inputs[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"{Path(args.input_video_path).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + + print0(f"[bold red]Results saved in: {output_video_path}[/bold red]") + print0(f"[bold red]\[vidtwin.scripts.inference_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py b/Meissonic/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py new file mode 100644 index 0000000000000000000000000000000000000000..69a88da7061da0f669fb62c64f0a7cea207700ce --- /dev/null +++ b/Meissonic/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py @@ -0,0 +1,264 @@ +import argparse +import datetime +import glob +import inspect +import os +import re +import sys +import numpy as np +import warnings +warnings.filterwarnings("ignore") +from inspect import Parameter +from typing import Union +from matplotlib import pyplot as plt +from natsort import natsorted +from omegaconf import OmegaConf +from packaging import version +from PIL import Image +from pathlib import Path +from tqdm import tqdm + +import torch +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch import seed_everything +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +import decord +import time +from einops import rearrange +from contextlib import nullcontext +from torch import autocast +from torchvision import transforms +from torchvision.utils import save_image +from torchvision.io import write_video +from safetensors.torch import load_file as 
load_safetensors + +from vidtok.modules.util import instantiate_from_config, print0 + + +def load_model_from_config(config, ckpt, verbose=False): + config = OmegaConf.load(config) + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Loading model from {ckpt}") + model = instantiate_from_config(config.model) + + if ckpt.endswith("ckpt"): + sd = torch.load(ckpt, map_location="cpu")["state_dict"] + elif ckpt.endswith("safetensors"): + sd = load_safetensors(ckpt) + else: + raise NotImplementedError(f"Unknown checkpoint: {ckpt}") + + missing, unexpected = model.load_state_dict(sd, strict=False) + print0( + f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Restored from {ckpt} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if len(missing) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Unexpected Keys: {unexpected}") + return model + + +class VideoDataset(torch.utils.data.Dataset): + def __init__(self, video_path, input_height=128, input_width=128, sample_fps=8, num_frames_per_batch=16): + decord.bridge.set_bridge("torch") + self.video_path = video_path + normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + self.transform = transforms.Compose([transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + normalize,]) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x:x+num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x:x+num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255. 
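# Frame-sampling sketch (not part of the committed diff): the datasets above
# keep every round(fps / sample_fps)-th source frame and group the kept indices
# into fixed-length clips, dropping a trailing partial clip. Pure-Python
# illustration of that indexing:
total_frames, fps, sample_fps, clip_len = 100, 30.0, 8, 16

interval = round(fps / sample_fps)                  # 4 -> keep every 4th frame
frame_ids = list(range(0, total_frames, interval))  # 25 candidate frame indices
clips = [frame_ids[i:i + clip_len]
         for i in range(0, len(frame_ids), clip_len)
         if len(frame_ids[i:i + clip_len]) == clip_len]
assert len(clips) == 1                              # the second, 9-frame chunk is dropped
assert clips[0][0] == 0 and clips[0][-1] == 60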
+ frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", + type=str, + help="evaluate at this precision", + choices=["full", "autocast"], + default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/tvae3d/webvid_kl_f_16_128_884_8chn_80G4.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="xxxxx.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path_structure", + type=str, + default="logs/assets/Nik.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_video_path_dynamics", + type=str, + default="logs/assets/Nik.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=4, + help="", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=16, + help="", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + parser.add_argument( + "--dynamics_split", + type=str2bool, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + os.makedirs(args.output_video_dir, exist_ok=True) + print(args.ckpt) + print(args.config) + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset_structure = VideoDataset(args.input_video_path_structure, args.input_height, args.input_width, args.sample_fps, args.num_frames_per_batch) + dataset_dynamics = VideoDataset(args.input_video_path_dynamics, args.input_height, args.input_width, args.sample_fps, args.num_frames_per_batch) + min_len = min(len(dataset_structure), len(dataset_dynamics)) + dataset_structure = torch.utils.data.Subset(dataset_structure, range(min_len)) + dataset_dynamics = torch.utils.data.Subset(dataset_dynamics, range(min_len)) + dataloader_structure = torch.utils.data.DataLoader(dataset_structure, batch_size=1, shuffle=False) + dataloader_dynamics = torch.utils.data.DataLoader(dataset_dynamics, batch_size=1, shuffle=False) + + inputs_structure = [] + inputs_dynamics = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input_structure, input_dynamics in 
zip(tqdm(range(min_len)), dataloader_structure, dataloader_dynamics): + if input_structure.shape[2] <= 5: + continue + input_structure = input_structure.to(device) + input_dynamics = input_dynamics.to(device) + if args.dynamics_split: + z, z_structure, *_ = model.encode(input_structure) + _, _, z_dynamics_x, z_dynamics_y = model.encode(input_dynamics) + xrec = model.decode(z, z_structure, z_dynamics_x, z_dynamics_y) + else: + z, z_structure, *_ = model.encode(input_structure) + _, _, z_dynamics = model.encode(input_dynamics) + xrec = model.decode(z, z_structure, z_dynamics) + input_structure = rearrange(input_structure, "b c t h w -> (b t) c h w") + inputs_structure.append(input_structure) + input_dynamics = rearrange(input_dynamics, "b c t h w -> (b t) c h w") + inputs_dynamics.append(input_dynamics) + xrec = rearrange(xrec, "b c t h w -> (b t) c h w") + outputs.append(xrec) + toc = time.time() + + # save the outputs as videos + inputs_structure = tensor_to_uint8(torch.cat(inputs_structure, dim=0)) + inputs_structure = rearrange(inputs_structure, "t c h w -> t h w c") + inputs_dynamics = tensor_to_uint8(torch.cat(inputs_dynamics, dim=0)) + inputs_dynamics = rearrange(inputs_dynamics, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs_structure.shape[0],inputs_dynamics.shape[0], outputs.shape[0]) + final = np.concatenate([inputs_structure[:min_len], inputs_dynamics[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"structure_{Path(args.input_video_path_structure).stem}_dynamics_{Path(args.input_video_path_dynamics).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Saved the reconstructed video to {output_video_path}") + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + +if __name__ == "__main__": + main() diff --git a/Meissonic/app.py b/Meissonic/app.py new file mode 100644 index 0000000000000000000000000000000000000000..405c7a647f69284d37a28aa20d43c65874dc446c --- /dev/null +++ b/Meissonic/app.py @@ -0,0 +1,149 @@ +import os +import sys +sys.path.append("./") + +import torch +from torchvision import transforms +from src.transformer import Transformer2DModel +from src.pipeline import Pipeline +from src.scheduler import Scheduler +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel +import gradio as gr + + +device = 'cuda' if torch.cuda.is_available() else 'cpu' + +model_path = "MeissonFlow/Meissonic" +model = Transformer2DModel.from_pretrained(model_path, subfolder="transformer") +vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae") +# text_encoder = CLIPTextModelWithProjection.from_pretrained(model_path, subfolder="text_encoder") +text_encoder = CLIPTextModelWithProjection.from_pretrained( #more stable sampling for some cases + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" + ) +tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer") +scheduler = Scheduler.from_pretrained(model_path, subfolder="scheduler") +pipe = Pipeline(vq_model, tokenizer=tokenizer, text_encoder=text_encoder, transformer=model, scheduler=scheduler) +pipe.to(device) + +MAX_SEED = 2**32 - 1 +MAX_IMAGE_SIZE = 1024 + + +def generate_image(prompt, 
negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, progress=gr.Progress(track_tqdm=True)): + if randomize_seed or seed == 0: + seed = torch.randint(0, MAX_SEED, (1,)).item() + torch.manual_seed(seed) + + image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + height=height, + width=width, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps + ).images[0] + + return image, seed + +# Default negative prompt +default_negative_prompt = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark" +css = """ +#col-container { + margin: 0 auto; + max-width: 640px; +} +""" + +examples = [ + "Modern Architecture render with pleasing aesthetics.", + "An image of a Pikachu wearing a birthday hat and playing guitar.", + "A statue of a lion stands in front of a building.", + "A white and blue coffee mug with a picture of a man on it.", + "A metal sculpture of a deer with antlers.", + "A bronze statue of an owl with its wings spread.", + "A white table with a vase of flowers and a cup of coffee on top of it.", + "A woman stands on a dock in the fog.", + "A lion's head is shown in a grayscale image.", + "A sculpture of a Greek woman head with a headband and a head of hair." +] + +with gr.Blocks(css=css) as demo: + with gr.Column(elem_id="col-container"): + gr.Markdown("# Meissonic Text-to-Image Generator") + with gr.Row(): + prompt = gr.Text( + label="Prompt", + show_label=False, + max_lines=1, + placeholder="Enter your prompt", + container=False, + ) + run_button = gr.Button("Run", scale=0, variant="primary") + result = gr.Image(label="Result", show_label=False) + with gr.Accordion("Advanced Settings", open=False): + negative_prompt = gr.Text( + label="Negative prompt", + max_lines=1, + placeholder="Enter a negative prompt", + value=default_negative_prompt, + ) + seed = gr.Slider( + label="Seed", + minimum=0, + maximum=MAX_SEED, + step=1, + value=0, + ) + randomize_seed = gr.Checkbox(label="Randomize seed", value=True) + with gr.Row(): + width = gr.Slider( + label="Width", + minimum=256, + maximum=MAX_IMAGE_SIZE, + step=32, + value=1024, + ) + height = gr.Slider( + label="Height", + minimum=256, + maximum=MAX_IMAGE_SIZE, + step=32, + value=1024, + ) + with gr.Row(): + guidance_scale = gr.Slider( + label="Guidance scale", + minimum=0.0, + maximum=20.0, + step=0.1, + value=9.0, + ) + num_inference_steps = gr.Slider( + label="Number of inference steps", + minimum=1, + maximum=100, + step=1, + value=64, + ) + gr.Examples(examples=examples, inputs=[prompt]) + gr.on( + triggers=[run_button.click, prompt.submit], + fn=generate_image, + inputs=[ + prompt, + negative_prompt, + seed, + randomize_seed, + width, + height, + guidance_scale, + num_inference_steps, + ], + outputs=[result, seed], + ) + +demo.launch() \ No newline at end of file diff --git a/Meissonic/app_Monetico.py b/Meissonic/app_Monetico.py new file mode 100644 index 0000000000000000000000000000000000000000..f406a996c094c368f5cacc382be490aba04b9ca8 --- /dev/null +++ b/Meissonic/app_Monetico.py @@ -0,0 +1,151 @@ +import os +import sys +sys.path.append("./") + +import torch +from torchvision import transforms +from src.transformer import Transformer2DModel +from src.pipeline import Pipeline +from src.scheduler import Scheduler +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel +import gradio as gr +import spaces + +device 
= 'cuda' if torch.cuda.is_available() else 'cpu' +dtype = torch.bfloat16 + +model_path = "Collov-Labs/Monetico" + +model = Transformer2DModel.from_pretrained(model_path, subfolder="transformer", torch_dtype=dtype) +vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae", torch_dtype=dtype) +text_encoder = CLIPTextModelWithProjection.from_pretrained(model_path, subfolder="text_encoder", torch_dtype=dtype) # better for Monetico +# text_encoder = CLIPTextModelWithProjection.from_pretrained( #more stable sampling for some cases +# "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=dtype +# ) +tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer", torch_dtype=dtype) +scheduler = Scheduler.from_pretrained(model_path, subfolder="scheduler", torch_dtype=dtype) +pipe = Pipeline(vq_model, tokenizer=tokenizer, text_encoder=text_encoder, transformer=model, scheduler=scheduler) +pipe.to(device) + +MAX_SEED = 2**32 - 1 +MAX_IMAGE_SIZE = 512 + +@spaces.GPU +def generate_image(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, progress=gr.Progress(track_tqdm=True)): + if randomize_seed or seed == 0: + seed = torch.randint(0, MAX_SEED, (1,)).item() + torch.manual_seed(seed) + + image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + height=height, + width=width, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps + ).images[0] + + return image, seed + +# Default negative prompt +default_negative_prompt = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark" +css = """ +#col-container { + margin: 0 auto; + max-width: 640px; +} +""" + +examples = [ + "Modern Architecture render with pleasing aesthetics.", + "An image of a Pikachu wearing a birthday hat and playing guitar.", + "A statue of a lion stands in front of a building.", + "A white and blue coffee mug with a picture of a man on it.", + "A metal sculpture of a deer with antlers.", + "A bronze statue of an owl with its wings spread.", + "A white table with a vase of flowers and a cup of coffee on top of it.", + "A woman stands on a dock in the fog.", + "A lion's head is shown in a grayscale image.", + "A sculpture of a Greek woman head with a headband and a head of hair." 
+] + +with gr.Blocks(css=css) as demo: + with gr.Column(elem_id="col-container"): + gr.Markdown("# Monetico Text-to-Image Generator") + with gr.Row(): + prompt = gr.Text( + label="Prompt", + show_label=False, + max_lines=1, + placeholder="Enter your prompt", + container=False, + ) + run_button = gr.Button("Run", scale=0, variant="primary") + result = gr.Image(label="Result", show_label=False) + with gr.Accordion("Advanced Settings", open=False): + negative_prompt = gr.Text( + label="Negative prompt", + max_lines=1, + placeholder="Enter a negative prompt", + value=default_negative_prompt, + ) + seed = gr.Slider( + label="Seed", + minimum=0, + maximum=MAX_SEED, + step=1, + value=0, + ) + randomize_seed = gr.Checkbox(label="Randomize seed", value=True) + with gr.Row(): + width = gr.Slider( + label="Width", + minimum=256, + maximum=MAX_IMAGE_SIZE, + step=32, + value=512, + ) + height = gr.Slider( + label="Height", + minimum=256, + maximum=MAX_IMAGE_SIZE, + step=32, + value=512, + ) + with gr.Row(): + guidance_scale = gr.Slider( + label="Guidance scale", + minimum=0.0, + maximum=20.0, + step=0.1, + value=9.0, + ) + num_inference_steps = gr.Slider( + label="Number of inference steps", + minimum=1, + maximum=100, + step=1, + value=48, + ) + gr.Examples(examples=examples, inputs=[prompt]) + gr.on( + triggers=[run_button.click, prompt.submit], + fn=generate_image, + inputs=[ + prompt, + negative_prompt, + seed, + randomize_seed, + width, + height, + guidance_scale, + num_inference_steps, + ], + outputs=[result, seed], + ) + +demo.launch() \ No newline at end of file diff --git a/Meissonic/app_fp8.py b/Meissonic/app_fp8.py new file mode 100644 index 0000000000000000000000000000000000000000..b9f06fa08e1563d531670144742ad5da1ceccc1f --- /dev/null +++ b/Meissonic/app_fp8.py @@ -0,0 +1,223 @@ +import os +import sys +sys.path.append("./") + +import torch +from src.transformer import Transformer2DModel +from src.pipeline import Pipeline +from src.scheduler import Scheduler +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel +import gradio as gr +import time +from torchao.quantization.quant_api import ( + quantize_, + float8_weight_only, +) + +device = 'cuda' + +def get_quantization_method(method): + quantization_methods = { + 'fp8': lambda: float8_weight_only(), + 'none': None + } + return quantization_methods.get(method, None) + +def load_models(quantization_method='none'): + model_path = "MeissonFlow/Meissonic" + dtype = torch.float16 + model = Transformer2DModel.from_pretrained(model_path, subfolder="transformer", torch_dtype=dtype) + vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae", torch_dtype=dtype) + text_encoder = CLIPTextModelWithProjection.from_pretrained( + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", + torch_dtype=dtype + ) + tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer") + scheduler = Scheduler.from_pretrained(model_path, subfolder="scheduler") + + if quantization_method != 'none': + quant_method = get_quantization_method(quantization_method) + if quant_method: + quantize_(model, quant_method()) + + pipe = Pipeline(vq_model, tokenizer=tokenizer, text_encoder=text_encoder, transformer=model, scheduler=scheduler) + return pipe.to(device) + +# Global variable to store the pipeline +global_pipe = None +current_quantization = 'none' + +def initialize_pipeline(quantization): + global global_pipe, current_quantization + if global_pipe is None or current_quantization != quantization: + global_pipe = 
load_models(quantization) + current_quantization = quantization + return global_pipe + +def generate_images(prompt, negative_prompt, seed, randomize_seed, width, height, + guidance_scale, num_inference_steps, quantization_method, batch_size=1, + progress=gr.Progress(track_tqdm=True)): + if randomize_seed or seed == 0: + seed = torch.randint(0, MAX_SEED, (1,)).item() + torch.manual_seed(seed) + + # Initialize or update pipeline if needed + pipe = initialize_pipeline(quantization_method) + + # Reset CUDA memory stats + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + + # Handle batch generation + if isinstance(prompt, str): + prompts = [prompt] * batch_size + else: + prompts = prompt[:batch_size] + + images = pipe( + prompt=prompts, + negative_prompt=[negative_prompt] * batch_size, + height=height, + width=width, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps + ).images + + # Calculate performance metrics + inference_time = time.time() - start_time + memory_used = torch.cuda.max_memory_reserved() / (1024 ** 3) # Convert to GB + + performance_info = f""" + Inference Time: {inference_time:.2f} seconds + Memory Used: {memory_used:.2f} GB + Quantization: {quantization_method} + """ + + return images[0] if batch_size == 1 else images, seed, performance_info + +MAX_SEED = 2**32 - 1 +MAX_IMAGE_SIZE = 1024 +default_negative_prompt = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark" + +examples = [ + "Two actors are posing for a pictur with one wearing a black and white face paint.", + "A large body of water with a rock in the middle and mountains in the background.", + "A white and blue coffee mug with a picture of a man on it.", + "The sun is setting over a city skyline with a river in the foreground.", + "A black and white cat with blue eyes.", + "Three boats in the ocean with a rainbow in the sky.", + "A robot playing the piano.", + "A cat wearing a hat.", + "A dog in a jungle." 
+] + +css = """ +#col-container { + margin: 0 auto; + max-width: 640px; +} +""" + +with gr.Blocks(css=css) as demo: + with gr.Column(elem_id="col-container"): + gr.Markdown("# Meissonic Text-to-Image Generator (with FP8 Support)") + + with gr.Row(): + prompt = gr.Text( + label="Prompt", + show_label=False, + max_lines=1, + placeholder="Enter your prompt", + container=False, + ) + run_button = gr.Button("Run", scale=0, variant="primary") + + result = gr.Image(label="Result", show_label=False) + performance_info = gr.Textbox(label="Performance Metrics", lines=4) + + with gr.Accordion("Advanced Settings", open=False): + quantization = gr.Radio( + choices=['none', 'fp8'], + value='none', + label="Quantization Method", + ) + negative_prompt = gr.Text( + label="Negative prompt", + max_lines=1, + placeholder="Enter a negative prompt", + value=default_negative_prompt, + ) + seed = gr.Slider( + label="Seed", + minimum=0, + maximum=MAX_SEED, + step=1, + value=0, + ) + randomize_seed = gr.Checkbox(label="Randomize seed", value=True) + + with gr.Row(): + width = gr.Slider( + label="Width", + minimum=256, + maximum=MAX_IMAGE_SIZE, + step=32, + value=1024, + ) + height = gr.Slider( + label="Height", + minimum=256, + maximum=MAX_IMAGE_SIZE, + step=32, + value=1024, + ) + + with gr.Row(): + guidance_scale = gr.Slider( + label="Guidance scale", + minimum=0.0, + maximum=20.0, + step=0.1, + value=9.0, + ) + num_inference_steps = gr.Slider( + label="Number of inference steps", + minimum=1, + maximum=100, + step=1, + value=64, + ) + + batch_size = gr.Slider( + label="Batch Size", + minimum=1, + maximum=8, + step=1, + value=1, + ) + + gr.Examples(examples=examples, inputs=[prompt]) + + gr.on( + triggers=[run_button.click, prompt.submit], + fn=generate_images, + inputs=[ + prompt, + negative_prompt, + seed, + randomize_seed, + width, + height, + guidance_scale, + num_inference_steps, + quantization, + batch_size, + ], + outputs=[result, seed, performance_info], + ) + +demo.launch() diff --git a/Meissonic/assets/architecture.png b/Meissonic/assets/architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..5cc0865dedd936ed3a3453637ce1a8b30608ba82 --- /dev/null +++ b/Meissonic/assets/architecture.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:174e021d396802a14e454914586ca45d19a31581bd8e6e98c0252eb1c8f4b1c3 +size 327943 diff --git a/Meissonic/assets/demos.pdf b/Meissonic/assets/demos.pdf new file mode 100644 index 0000000000000000000000000000000000000000..794dbe526fc6cfc843a77c9fb581119c1f602429 --- /dev/null +++ b/Meissonic/assets/demos.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d14191e0b8e9fdf4cb3a7199cf36554e60e456cdeba11509d305a8201e6b131 +size 2476203 diff --git a/Meissonic/assets/demos.png b/Meissonic/assets/demos.png new file mode 100644 index 0000000000000000000000000000000000000000..28073db0290f0e1a8c40c88c0249773a11fcde92 --- /dev/null +++ b/Meissonic/assets/demos.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79322f0c5ba7093d2d5e2274f6d52e257d0063573eab49b19243aefbed63dd5e +size 1828570 diff --git a/Meissonic/assets/inpaint/0eKR4M2uuL8.jpg b/Meissonic/assets/inpaint/0eKR4M2uuL8.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4c6a908dc01284ed3e3ce1a785762f3b8144fae4 --- /dev/null +++ b/Meissonic/assets/inpaint/0eKR4M2uuL8.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ee6f2d8fae720821257db75cc819919a780f644f7a7a3e83aab0bdaccb13d53 
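# Quantization sketch (not part of the committed diff) mirroring app_fp8.py
# above: torchao's quantize_ rewrites the nn.Linear weights of a module in
# place with float8 weight-only quantization, while activations keep their
# original dtype. Needs a recent torchao, a PyTorch build with float8 dtypes,
# and a CUDA device; shown here on a toy module rather than the Meissonic
# transformer.
import torch
import torch.nn as nn
from torchao.quantization.quant_api import quantize_, float8_weight_only

toy = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 8)).half().cuda()
quantize_(toy, float8_weight_only())   # in-place, returns None
with torch.no_grad():
    out = toy(torch.randn(2, 64, dtype=torch.float16, device="cuda"))
print(out.shape, out.dtype)            # weights stored in fp8, output stays fp16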
+size 1088383 diff --git a/Meissonic/assets/inpaint/0eKR4M2uuL8.png b/Meissonic/assets/inpaint/0eKR4M2uuL8.png new file mode 100644 index 0000000000000000000000000000000000000000..0a44ef73244a5c7f3bdb69a1b42eee6caa922431 Binary files /dev/null and b/Meissonic/assets/inpaint/0eKR4M2uuL8.png differ diff --git a/Meissonic/assets/inpaint/_Rh_zxIUWXA.jpg b/Meissonic/assets/inpaint/_Rh_zxIUWXA.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2a556e92e7deeb04335b7b43b0f5af2f44c30c7c Binary files /dev/null and b/Meissonic/assets/inpaint/_Rh_zxIUWXA.jpg differ diff --git a/Meissonic/assets/inpaint/_Rh_zxIUWXA.png b/Meissonic/assets/inpaint/_Rh_zxIUWXA.png new file mode 100644 index 0000000000000000000000000000000000000000..32011f9ec1c7f06020a82263a751a4faf76cac02 Binary files /dev/null and b/Meissonic/assets/inpaint/_Rh_zxIUWXA.png differ diff --git a/Meissonic/assets/inpaint/__Owak0IgJk.jpg b/Meissonic/assets/inpaint/__Owak0IgJk.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1bd6703272a5ddb1c5614e7c2d440060611a927f --- /dev/null +++ b/Meissonic/assets/inpaint/__Owak0IgJk.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a08fa5cd031f119cb3123d767f2358b5081d6bc7dc1b017081f66bb824e86ae0 +size 109431 diff --git a/Meissonic/assets/inpaint/__Owak0IgJk.png b/Meissonic/assets/inpaint/__Owak0IgJk.png new file mode 100644 index 0000000000000000000000000000000000000000..4a1185857570efc99b201e5947420faaee7a7a43 Binary files /dev/null and b/Meissonic/assets/inpaint/__Owak0IgJk.png differ diff --git a/Meissonic/assets/inpaint/cases.json b/Meissonic/assets/inpaint/cases.json new file mode 100644 index 0000000000000000000000000000000000000000..398136324425e59ae9eabc018e147512c31dcf93 --- /dev/null +++ b/Meissonic/assets/inpaint/cases.json @@ -0,0 +1,20 @@ +[ + { + "input":"./assets/inpaint/_Rh_zxIUWXA.jpg", + "mask": "./assets/inpaint/_Rh_zxIUWXA.png", + "prompt": "A woman with short hair wears a silver gas mask.", + "negative_prompts": null + }, + { + "input":"./assets/inpaint/0eKR4M2uuL8.jpg", + "mask": "./assets/inpaint/0eKR4M2uuL8.png", + "prompt": "A stylish dog wearing sunglasses.", + "negative_prompts": null + }, + { + "input":"./assets/inpaint/__Owak0IgJk.jpg", + "mask": "./assets/inpaint/__Owak0IgJk.png", + "prompt": "A woman wearing a white suspender skirt is sitting", + "negative_prompts": null + } +] \ No newline at end of file diff --git a/Meissonic/assets/outpaint/__G2yFuW7jQ.jpg b/Meissonic/assets/outpaint/__G2yFuW7jQ.jpg new file mode 100644 index 0000000000000000000000000000000000000000..58152afe80a86ff90ac12b443d05d49db4978369 --- /dev/null +++ b/Meissonic/assets/outpaint/__G2yFuW7jQ.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e98e81fa367ade7edd41cd60769d04ab6a7593dea3b3ec35f3f0e84634669de +size 120864 diff --git a/Meissonic/assets/outpaint/__G2yFuW7jQ.png b/Meissonic/assets/outpaint/__G2yFuW7jQ.png new file mode 100644 index 0000000000000000000000000000000000000000..df01f550cd0bdce62dba537055e2fe664297ec15 Binary files /dev/null and b/Meissonic/assets/outpaint/__G2yFuW7jQ.png differ diff --git a/Meissonic/assets/outpaint/cases.json b/Meissonic/assets/outpaint/cases.json new file mode 100644 index 0000000000000000000000000000000000000000..a50b7c7adcf635d3b6c35ef2cb7be43eab84cd46 --- /dev/null +++ b/Meissonic/assets/outpaint/cases.json @@ -0,0 +1,20 @@ +[ + { + "input":"./assets/outpaint/__G2yFuW7jQ.jpg", + "mask": "./assets/outpaint/__G2yFuW7jQ.png", + "prompt": "fall mountains", + 
"negative_prompts": "The artwork avoids the pitfalls of bad art, such as ugly and deformed eyes and faces, poorly drawn, blurry, and disfigured bodies with extra limbs and close-ups that look weird. It also avoids other common issues such as watermarking, text errors, missing fingers or digits, cropping, poor quality, and JPEG artifacts. The artwork is free of signature or watermark and avoids framing issues.The hands are not deformed, the eyes are not disfigured, and there areno extra bodies or limbs. The artwork is not blurry, out of focus, or poorly drawn, and the proportions are not bad or deformed. There are no mutations, missing limbs, or floating or disconnected limbs. The hands and neck are not malformed, and there are no extra heads or out-of-frame elements. The artwork is not low-res or disgusting and is a well-drawn, highly detailed, and beautiful rendering." + }, + { + "input":"./assets/outpaint/__G2yFuW7jQ.jpg", + "mask": "./assets/outpaint/__G2yFuW7jQ.png", + "prompt": "Rocket launch site", + "negative_prompts": "The artwork avoids the pitfalls of bad art, such as ugly and deformed eyes and faces, poorly drawn, blurry, and disfigured bodies with extra limbs and close-ups that look weird. It also avoids other common issues such as watermarking, text errors, missing fingers or digits, cropping, poor quality, and JPEG artifacts. The artwork is free of signature or watermark and avoids framing issues.The hands are not deformed, the eyes are not disfigured, and there areno extra bodies or limbs. The artwork is not blurry, out of focus, or poorly drawn, and the proportions are not bad or deformed. There are no mutations, missing limbs, or floating or disconnected limbs. The hands and neck are not malformed, and there are no extra heads or out-of-frame elements. The artwork is not low-res or disgusting and is a well-drawn, highly detailed, and beautiful rendering." + }, + { + "input":"./assets/outpaint/__G2yFuW7jQ.jpg", + "mask": "./assets/outpaint/__G2yFuW7jQ.png", + "prompt": "Volcano", + "negative_prompts": "The artwork avoids the pitfalls of bad art, such as ugly and deformed eyes and faces, poorly drawn, blurry, and disfigured bodies with extra limbs and close-ups that look weird. It also avoids other common issues such as watermarking, text errors, missing fingers or digits, cropping, poor quality, and JPEG artifacts. The artwork is free of signature or watermark and avoids framing issues.The hands are not deformed, the eyes are not disfigured, and there areno extra bodies or limbs. The artwork is not blurry, out of focus, or poorly drawn, and the proportions are not bad or deformed. There are no mutations, missing limbs, or floating or disconnected limbs. The hands and neck are not malformed, and there are no extra heads or out-of-frame elements. The artwork is not low-res or disgusting and is a well-drawn, highly detailed, and beautiful rendering." 
+ } +] \ No newline at end of file diff --git a/Meissonic/cog.yaml b/Meissonic/cog.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62a932f5e559b60da72737c697eaafdd5521d7cd --- /dev/null +++ b/Meissonic/cog.yaml @@ -0,0 +1,29 @@ +# Configuration for Cog ⚙️ +# Reference: https://cog.run/yaml + +build: + # set to true if your model requires a GPU + gpu: true + + # a list of ubuntu apt packages to install + system_packages: + - "libgl1-mesa-glx" + - "libglib2.0-0" + + # python version in the form '3.11' or '3.11.4' + python_version: "3.11" + + # a list of packages in the format == + python_packages: + - torch + - torchvision + - git+https://github.com/huggingface/diffusers.git + - accelerate + - transformers + + # commands run after the environment is setup + run: + - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget + +# predict.py defines how predictions are run on your model +predict: "predict.py:Predictor" diff --git a/Meissonic/cosmos_test_output/comparison_grid_video_0.png b/Meissonic/cosmos_test_output/comparison_grid_video_0.png new file mode 100644 index 0000000000000000000000000000000000000000..b4a67c150ac34219cba77f43b57b30d25c6a26b0 --- /dev/null +++ b/Meissonic/cosmos_test_output/comparison_grid_video_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73ee850b107ae6842e8c8a528689bfcb930ad7a5d863a9c7f1e2245f82650fa2 +size 394999 diff --git a/Meissonic/cosmos_test_output/comparison_grid_video_1.png b/Meissonic/cosmos_test_output/comparison_grid_video_1.png new file mode 100644 index 0000000000000000000000000000000000000000..363ded074c9441f39fba859f3bd51cccbfecc6da --- /dev/null +++ b/Meissonic/cosmos_test_output/comparison_grid_video_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80853c45d1431ab2dee94b554c6d9e64f2d17ff1257aa72f560cf3040ea39f27 +size 8286293 diff --git a/Meissonic/cosmos_test_output/comparison_grid_video_2.png b/Meissonic/cosmos_test_output/comparison_grid_video_2.png new file mode 100644 index 0000000000000000000000000000000000000000..9ccf9964a5b144c5de2f9e736062e302be36526c --- /dev/null +++ b/Meissonic/cosmos_test_output/comparison_grid_video_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a285e5661e7100963aada2ca27ead641867d989f6ffaa37f7c1f5d873adfbd +size 9085956 diff --git a/Meissonic/cosmos_test_output/comparison_grid_video_3.png b/Meissonic/cosmos_test_output/comparison_grid_video_3.png new file mode 100644 index 0000000000000000000000000000000000000000..f6c8809b638f2a84a7f05be19c136653958f8fb6 --- /dev/null +++ b/Meissonic/cosmos_test_output/comparison_grid_video_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e56e9d5e0cbfbf740d1c53c2d105616e3dc7781e1c8945d712b5edc583ad715f +size 8634545 diff --git a/Meissonic/cosmos_test_output/comparison_video_0.mp4 b/Meissonic/cosmos_test_output/comparison_video_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e899108ec77f5f3cf6d22bf1e6d9ccf63564a281 Binary files /dev/null and b/Meissonic/cosmos_test_output/comparison_video_0.mp4 differ diff --git a/Meissonic/cosmos_test_output/comparison_video_1.mp4 b/Meissonic/cosmos_test_output/comparison_video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9f5bf55a28c9849abdbd0794d22ef2df7d418f48 --- /dev/null +++ b/Meissonic/cosmos_test_output/comparison_video_1.mp4 @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b7311b27e36333219d20c8d835432ecadf9ebe5977bcf760bc6706a85a95cabd +size 1089113 diff --git a/Meissonic/cosmos_test_output/comparison_video_2.mp4 b/Meissonic/cosmos_test_output/comparison_video_2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e9a86b65067e996f43a9b6a994e27b62a92cff24 --- /dev/null +++ b/Meissonic/cosmos_test_output/comparison_video_2.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e02445cac3531ab68bda4ba1bc90ac570a7b423f78b9493471acb4d6e5f9a28 +size 1618316 diff --git a/Meissonic/cosmos_test_output/comparison_video_3.mp4 b/Meissonic/cosmos_test_output/comparison_video_3.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..34b9bf25d163f76d849fbe42ef41262a8bd9acfc --- /dev/null +++ b/Meissonic/cosmos_test_output/comparison_video_3.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:017fcf1133dc553228724625c5ad6ec7f58f97ddc27c91201aa88a07423a76e2 +size 931953 diff --git a/Meissonic/cosmos_test_output/metrics_video_0.txt b/Meissonic/cosmos_test_output/metrics_video_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..671a60175ef523c994215a0935b89b04eba172b5 --- /dev/null +++ b/Meissonic/cosmos_test_output/metrics_video_0.txt @@ -0,0 +1,12 @@ +Video Index: 0 +Video Path: 000/000/000/0.mp4 +Caption: In the video, a man is seen in a living room setting, standing in front of a window with blinds. He is wearing a black sweater and appears to be in the middle of a conversation. The room is dimly lit, with a lamp providing a soft glow in the background. The man's expression is serious, suggesting that the conversation is of importance. The overall style of the video is realistic and naturalistic, capturing a candid moment in the man's life. + +=== Metrics === +Average PSNR: 27.54 dB +Average MSE: 0.001764 +Average SSIM: 0.9779 + +Per-frame PSNR: [26.747089385986328, 27.265975952148438, 27.32347297668457, 27.352922439575195, 27.334339141845703, 27.782726287841797, 27.661243438720703, 27.803525924682617, 27.705425262451172, 27.679603576660156, 27.297304153442383, 27.51146125793457, 27.4649658203125, 27.89719581604004, 27.753822326660156, 27.86109161376953, 27.80060577392578] +Per-frame MSE: [0.002114907605573535, 0.0018767336150631309, 0.0018520501907914877, 0.001839534263126552, 0.001847422681748867, 0.001666200696490705, 0.0017134671797975898, 0.0016582406824454665, 0.0016961240908131003, 0.0017062382539734244, 0.0018632437568157911, 0.001773593365214765, 0.001792682334780693, 0.0016228572931140661, 0.0016773275565356016, 0.0016364054754376411, 0.0016593559412285686] +Per-frame SSIM: [0.9738484025001526, 0.9765720963478088, 0.9768512845039368, 0.9769563674926758, 0.9766926765441895, 0.9789631962776184, 0.9783727526664734, 0.9790931344032288, 0.9786423444747925, 0.9784184694290161, 0.9765286445617676, 0.9777430295944214, 0.9775411486625671, 0.9799171090126038, 0.9794126749038696, 0.9798620343208313, 0.9795750379562378] diff --git a/Meissonic/cosmos_test_output/metrics_video_1.txt b/Meissonic/cosmos_test_output/metrics_video_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..4222a4c6b10cff29079574967ed3908eebca46ee --- /dev/null +++ b/Meissonic/cosmos_test_output/metrics_video_1.txt @@ -0,0 +1,12 @@ +Video Index: 1 +Video Path: 000/000/001/1.mp4 +Caption: The video shows a man standing next to a purple van with a floral design on the side. 
The man is wearing a black t-shirt and jeans, and he is smiling and waving his hands in the air. The van has pink rims and a black roof rack. The van is parked in front of a building with a glass door. The man appears to be happy and excited about the van. The video is likely a short clip of a man showing off his van. + +=== Metrics === +Average PSNR: 25.14 dB +Average MSE: 0.003232 +Average SSIM: 0.9700 + +Per-frame PSNR: [29.570905685424805, 25.845619201660156, 24.151002883911133, 24.53882598876953, 26.607555389404297, 23.609159469604492, 23.5848445892334, 24.532224655151367, 26.290340423583984, 23.606443405151367, 23.633737564086914, 24.562894821166992, 26.255611419677734, 24.259323120117188, 25.643463134765625, 25.491649627685547] +Per-frame MSE: [0.0011038482189178467, 0.002602784661576152, 0.003845029277727008, 0.0035165559966117144, 0.0021839593537151814, 0.004355963785201311, 0.004380417056381702, 0.003521904582157731, 0.0023494488559663296, 0.004358689300715923, 0.004331381060183048, 0.0034971192944794893, 0.0023683111649006605, 0.0037503137718886137, 0.0027268033009022474, 0.002823807764798403] +Per-frame SSIM: [0.9893906712532043, 0.974087119102478, 0.9622812867164612, 0.9658014178276062, 0.9791845083236694, 0.957465648651123, 0.957618772983551, 0.9664595127105713, 0.977811872959137, 0.9590921998023987, 0.960818350315094, 0.9692971110343933, 0.9799112677574158, 0.9681466817855835, 0.9765862822532654, 0.9759609699249268] diff --git a/Meissonic/cosmos_test_output/metrics_video_2.txt b/Meissonic/cosmos_test_output/metrics_video_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecafb94fe13e5e9bd2a5a3c5cd57c9512b6d533f --- /dev/null +++ b/Meissonic/cosmos_test_output/metrics_video_2.txt @@ -0,0 +1,12 @@ +Video Index: 2 +Video Path: 000/000/002/2.mp4 +Caption: The video is a news segment featuring a man in a red baseball cap and a blue vest, standing in front of a statue of a soldier and two children. The man appears to be a veteran, as indicated by the cap and the context of the event. The event is an honorary ceremony for lost submarines and submarine veterans, taking place near the World Peace Bell in Newport. The news segment is titled "Connected to the Community" and is scheduled to air at 11:10 PM on ABC 9. The style of the video is informative and respectful, focusing on the man and the event, with a clear and concise presentation of the details. 
+ +=== Metrics === +Average PSNR: 22.09 dB +Average MSE: 0.006399 +Average SSIM: 0.9607 + +Per-frame PSNR: [24.496965408325195, 22.367679595947266, 22.21709442138672, 22.679195404052734, 23.883594512939453, 22.220516204833984, 22.20623207092285, 21.4675350189209, 22.316797256469727, 19.425098419189453, 21.102333068847656, 21.321147918701172, 23.025981903076172, 21.053565979003906, 21.95743179321289, 21.684494018554688] +Per-frame MSE: [0.0035506151616573334, 0.005797383841127157, 0.006001925095915794, 0.005396105814725161, 0.0040892185643315315, 0.00599720049649477, 0.006016954779624939, 0.007132581900805235, 0.005865707993507385, 0.011415375396609306, 0.007758304942399263, 0.007377093657851219, 0.00498197739943862, 0.007845907472074032, 0.006371723022311926, 0.0067850141786038876] +Per-frame SSIM: [0.9770643711090088, 0.9623730778694153, 0.9605475068092346, 0.9618802070617676, 0.9713600277900696, 0.9565339088439941, 0.9568989872932434, 0.9560506939888, 0.9673117399215698, 0.9364117383956909, 0.9567262530326843, 0.9589394927024841, 0.9706904888153076, 0.9546973705291748, 0.9623250365257263, 0.9610125422477722] diff --git a/Meissonic/cosmos_test_output/metrics_video_3.txt b/Meissonic/cosmos_test_output/metrics_video_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..541d12165559e95bc4d39145fcb59be4c5b948d3 --- /dev/null +++ b/Meissonic/cosmos_test_output/metrics_video_3.txt @@ -0,0 +1,12 @@ +Video Index: 3 +Video Path: 000/000/003/3.mp4 +Caption: The video features a man in a pink shirt and a black bucket hat, wearing glasses and a necklace. He is holding a spoon and making a playful face, as if he is about to eat something. The background shows a lush garden with trees and a wooden structure. The man's expression and the spoon suggest that he is about to taste something, possibly food. The overall style of the video is casual and fun, with a focus on the man's reaction to the food. 
+ +=== Metrics === +Average PSNR: 26.22 dB +Average MSE: 0.002459 +Average SSIM: 0.9856 + +Per-frame PSNR: [27.509328842163086, 26.409242630004883, 25.4619140625, 25.407241821289062, 26.446935653686523, 23.73136329650879, 25.60137176513672, 26.993793487548828, 28.306987762451172, 25.729787826538086, 25.27326774597168, 26.266807556152344, 27.462078094482422, 25.950550079345703, 26.63888168334961, 26.327953338623047] +Per-frame MSE: [0.001774463220499456, 0.0022859980817884207, 0.002843207446858287, 0.0028792270459234715, 0.0022662426345050335, 0.004235099535435438, 0.002753359731286764, 0.0019981153309345245, 0.0014767315005883574, 0.002673137467354536, 0.00296943006105721, 0.0023622140288352966, 0.0017938758246600628, 0.0025406500790268183, 0.0021682626102119684, 0.002329188399016857] +Per-frame SSIM: [0.9894547462463379, 0.9864593744277954, 0.983065664768219, 0.9832437634468079, 0.9867878556251526, 0.9748696088790894, 0.9840085506439209, 0.9884393215179443, 0.991378664970398, 0.9843630194664001, 0.9830930829048157, 0.986225962638855, 0.989523708820343, 0.9850807189941406, 0.9871661067008972, 0.9861734509468079] diff --git a/Meissonic/inference.py b/Meissonic/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e67cdb72c941f77ff27675cac53ff2f53a0cf03b --- /dev/null +++ b/Meissonic/inference.py @@ -0,0 +1,65 @@ +import os +import sys +sys.path.append("./") + +import torch +from torchvision import transforms +from src.transformer import Transformer2DModel +from src.pipeline import Pipeline +from src.scheduler import Scheduler +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel + +device = 'cuda' + +model_path = "MeissonFlow/Meissonic" +model = Transformer2DModel.from_pretrained(model_path,subfolder="transformer",) +vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae", ) +# text_encoder = CLIPTextModelWithProjection.from_pretrained(model_path,subfolder="text_encoder",) +text_encoder = CLIPTextModelWithProjection.from_pretrained( #using original text enc for stable sampling + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" + ) +tokenizer = CLIPTokenizer.from_pretrained(model_path,subfolder="tokenizer",) +scheduler = Scheduler.from_pretrained(model_path,subfolder="scheduler",) +pipe=Pipeline(vq_model, tokenizer=tokenizer,text_encoder=text_encoder,transformer=model,scheduler=scheduler) + +pipe = pipe.to(device) + +steps = 64 +CFG = 9 +resolution = 1024 +negative_prompt = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark" + +prompts = [ + "Two actors are posing for a pictur with one wearing a black and white face paint.", + "A large body of water with a rock in the middle and mountains in the background.", + "A white and blue coffee mug with a picture of a man on it.", + "A statue of a man with a crown on his head.", + "A man in a yellow wet suit is holding a big black dog in the water.", + "A white table with a vase of flowers and a cup of coffee on top of it.", + "A woman stands on a dock in the fog.", + "A woman is standing next to a picture of another woman." 
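+    # Note: batched_generation is False below, so by default only the first prompt in this list is rendered.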
+] + +batched_generation = False +num_images = len(prompts) if batched_generation else 1 + +images = pipe( + prompt=prompts[:num_images], + negative_prompt=[negative_prompt] * num_images, + height=resolution, + width=resolution, + guidance_scale=CFG, + num_inference_steps=steps + ).images + +output_dir = "./output" +os.makedirs(output_dir, exist_ok=True) +for i, prompt in enumerate(prompts[:num_images]): + sanitized_prompt = prompt.replace(" ", "_") + file_path = os.path.join(output_dir, f"{sanitized_prompt}_{resolution}_{steps}_{CFG}.png") + images[i].save(file_path) + print(f"The {i+1}/{num_images} image is saved to {file_path}") diff --git a/Meissonic/inference_fp16.py b/Meissonic/inference_fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..3d7af22ff083b4606b95bd928e0a674ed83ace29 --- /dev/null +++ b/Meissonic/inference_fp16.py @@ -0,0 +1,64 @@ +import os +import sys +sys.path.append("./") + +import torch +from torchvision import transforms +from src.transformer import Transformer2DModel +from src.pipeline import Pipeline +from src.scheduler import Scheduler +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel + +device = 'cuda' +dtype = torch.bfloat16 +model_path = "MeissonFlow/Meissonic" +model = Transformer2DModel.from_pretrained(model_path, subfolder="transformer", torch_dtype=dtype) +vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae", torch_dtype=dtype) +# text_encoder = CLIPTextModelWithProjection.from_pretrained(model_path,subfolder="text_encoder", torch_dtype=dtype) +text_encoder = CLIPTextModelWithProjection.from_pretrained( #using original text enc for stable sampling + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",torch_dtype=dtype) +tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer", torch_dtype=dtype) +scheduler = Scheduler.from_pretrained(model_path, subfolder="scheduler") +pipe=Pipeline(vq_model, tokenizer=tokenizer,text_encoder=text_encoder,transformer=model,scheduler=scheduler) + +pipe = pipe.to(device) + +steps = 64 +CFG = 9 +resolution = 1024 +negative_prompt = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark" + +prompts = [ + "Two actors are posing for a pictur with one wearing a black and white face paint.", + "A large body of water with a rock in the middle and mountains in the background.", + "A white and blue coffee mug with a picture of a man on it.", + "A statue of a man with a crown on his head.", + "A man in a yellow wet suit is holding a big black dog in the water.", + "A white table with a vase of flowers and a cup of coffee on top of it.", + "A woman stands on a dock in the fog.", + "A woman is standing next to a picture of another woman." 
+] + +batched_generation = False +num_images = len(prompts) if batched_generation else 1 + +images = pipe( + prompt=prompts[:num_images], + negative_prompt=[negative_prompt] * num_images, + height=resolution, + width=resolution, + guidance_scale=CFG, + num_inference_steps=steps + ).images + +output_dir = "./output" +os.makedirs(output_dir, exist_ok=True) +for i, prompt in enumerate(prompts[:num_images]): + sanitized_prompt = prompt.replace(" ", "_") + file_path = os.path.join(output_dir, f"{sanitized_prompt}_{resolution}_{steps}_{CFG}.png") + images[i].save(file_path) + print(f"The {i+1}/{num_images} image is saved to {file_path}") diff --git a/Meissonic/inference_fp16_Monetico.py b/Meissonic/inference_fp16_Monetico.py new file mode 100644 index 0000000000000000000000000000000000000000..0460597cc5a11813b218a95bff1201d88a2014e4 --- /dev/null +++ b/Meissonic/inference_fp16_Monetico.py @@ -0,0 +1,64 @@ +import os +import sys +sys.path.append("./") + +import torch +from torchvision import transforms +from src.transformer import Transformer2DModel +from src.pipeline import Pipeline +from src.scheduler import Scheduler +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel + +device = 'cuda' +dtype = torch.bfloat16 +model_path = "Collov-Labs/Monetico" +model = Transformer2DModel.from_pretrained(model_path, subfolder="transformer", torch_dtype=dtype) +vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae", torch_dtype=dtype) +text_encoder = CLIPTextModelWithProjection.from_pretrained(model_path, subfolder="text_encoder", torch_dtype=dtype) # better for Monetico +# text_encoder = CLIPTextModelWithProjection.from_pretrained( #more stable sampling for some cases +# "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=dtype +# ) +tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer", torch_dtype=dtype) +scheduler = Scheduler.from_pretrained(model_path, subfolder="scheduler", torch_dtype=dtype) +pipe = Pipeline(vq_model, tokenizer=tokenizer, text_encoder=text_encoder, transformer=model, scheduler=scheduler) +pipe.to(device) + +steps = 48 +CFG = 9 +resolution = 512 +negative_prompt = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark" + +prompts = [ + "Two actors are posing for a pictur with one wearing a black and white face paint.", + "A large body of water with a rock in the middle and mountains in the background.", + "A white and blue coffee mug with a picture of a man on it.", + "A statue of a man with a crown on his head.", + "A man in a yellow wet suit is holding a big black dog in the water.", + "A white table with a vase of flowers and a cup of coffee on top of it.", + "A woman stands on a dock in the fog.", + "A woman is standing next to a picture of another woman." 
+] + +batched_generation = False +num_images = len(prompts) if batched_generation else 1 + +images = pipe( + prompt=prompts[:num_images], + negative_prompt=[negative_prompt] * num_images, + height=resolution, + width=resolution, + guidance_scale=CFG, + num_inference_steps=steps + ).images + +output_dir = "./output" +os.makedirs(output_dir, exist_ok=True) +for i, prompt in enumerate(prompts[:num_images]): + sanitized_prompt = prompt.replace(" ", "_") + file_path = os.path.join(output_dir, f"{sanitized_prompt}_{resolution}_{steps}_{CFG}.png") + images[i].save(file_path) + print(f"The {i+1}/{num_images} image is saved to {file_path}") diff --git a/Meissonic/inference_fp8.py b/Meissonic/inference_fp8.py new file mode 100644 index 0000000000000000000000000000000000000000..2804af3a06a02400fb69366d414bf419ade183b6 --- /dev/null +++ b/Meissonic/inference_fp8.py @@ -0,0 +1,103 @@ +import os +import sys +sys.path.append("./") + +import torch +from src.transformer import Transformer2DModel +from src.pipeline import Pipeline +from src.scheduler import Scheduler +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel +import time +import argparse + +from torchao.quantization.quant_api import ( + quantize_, + float8_weight_only, # A8W8 FP8 +) + +device = 'cuda' + +def get_quantization_method(method): + quantization_methods = { + 'fp8': lambda: float8_weight_only(), + } + return quantization_methods.get(method, None) + +def load_models(quantization_method=None): + model_path = "MeissonFlow/Meissonic" + dtype = torch.float16 + model = Transformer2DModel.from_pretrained(model_path, subfolder="transformer", torch_dtype=dtype) + vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae", torch_dtype=dtype) + text_encoder = CLIPTextModelWithProjection.from_pretrained( + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", + torch_dtype=dtype + ) + tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer") + scheduler = Scheduler.from_pretrained(model_path, subfolder="scheduler") + + if quantization_method: + quant_method = get_quantization_method(quantization_method) + if quant_method: + quantize_(model, quant_method()) + else: + print(f"Unsupported quantization method: {quantization_method}") + + + pipe = Pipeline(vq_model, tokenizer=tokenizer, text_encoder=text_encoder, transformer=model, scheduler=scheduler) + return pipe.to(device) + +def run_inference(pipe, prompt, negative_prompt, resolution, cfg, steps): + return pipe(prompt=prompt, negative_prompt=negative_prompt, height=resolution, width=resolution, guidance_scale=cfg, num_inference_steps=steps).images[0] + +def main(quantization_method): + steps = 64 + CFG = 9 + resolution = 1024 + negative_prompts = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark" + + prompts = [ + "Two actors are posing for a pictur with one wearing a black and white face paint.", + "A large body of water with a rock in the middle and mountains in the background.", + "A white and blue coffee mug with a picture of a man on it.", + "The sun is setting over a city skyline with a river in the foreground.", + "A black and white cat with blue eyes.", + "Three boats in the ocean with a rainbow in the sky.", + "A robot playing the piano.", + "A cat wearing a hat.", + "A dog in a jungle.", + ] + + output_dir = "./output" + os.makedirs(output_dir, exist_ok=True) + + pipe = load_models(quantization_method) + 
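+    # Benchmark loop: peak CUDA memory stats are reset before each image so the reported
+    # figure reflects that prompt alone; per-image wall-clock time and peak reserved memory
+    # are printed, then total time and average memory per image are summarized at the end.
+    # A possible extension (hedged, not part of this script): torchao's quant_api also exposes
+    # int8_weight_only(), which could be registered in get_quantization_method() above
+    # alongside 'fp8', assuming the installed torchao version provides it.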
start_time = time.time() + total_memory_used = 0 + for i, prompt in enumerate(prompts): + torch.cuda.reset_peak_memory_stats() + image_start_time = time.time() + image = run_inference(pipe, prompt, negative_prompts, resolution, CFG, steps) + image_end_time = time.time() + image.save(os.path.join(output_dir, f"{prompt[:10]}_{resolution}_{steps}_{CFG}_{quantization_method}.png")) + + memory_used = torch.cuda.max_memory_reserved() / (1024 ** 3) # Convert to GB + total_memory_used += memory_used + + print(f"Image {i+1} time: {image_end_time - image_start_time:.2f} seconds") + print(f"Image {i+1} max memory used: {memory_used:.2f} GB") + + total_time = time.time() - start_time + avg_memory_used = total_memory_used / len(prompts) + print(f"Total inference time ({quantization_method}): {total_time:.2f} seconds") + print(f"Average memory used per image: {avg_memory_used:.2f} GB") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run inference with specified quantization method.") + parser.add_argument("--quantization", type=str, choices=['fp8'], + help="Quantization method to use") + args = parser.parse_args() + main(args.quantization) diff --git a/Meissonic/inpaint.py b/Meissonic/inpaint.py new file mode 100644 index 0000000000000000000000000000000000000000..4fa97623ac9d05d51645a3e61f6cb35a2f90327b --- /dev/null +++ b/Meissonic/inpaint.py @@ -0,0 +1,55 @@ +import os +import sys +sys.path.append("./") + +import argparse +import json +from PIL import Image +from src.transformer import Transformer2DModel +from src.pipeline_inpaint import InpaintPipeline +from src.scheduler import Scheduler +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel + +def get_parse_args(): + parser = argparse.ArgumentParser(description="Meissonic Inpaint and Outpaint") + parser.add_argument("--mode", type=str,default="inpaint", choices=["inpaint", "outpaint"], help="Inpaint or Outpaint") + return parser.parse_args() + +if __name__ == "__main__": + args = get_parse_args() + device = 'cuda' + + model_path = "MeissonFlow/Meissonic" + model = Transformer2DModel.from_pretrained(model_path, subfolder="transformer", ) + vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae", ) + # text_encoder = CLIPTextModelWithProjection.from_pretrained(model_path,subfolder="text_encoder",) + text_encoder = CLIPTextModelWithProjection.from_pretrained( # using original text enc for stable sampling + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" + ) + tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer", ) + scheduler = Scheduler.from_pretrained(model_path, subfolder="scheduler", ) + + pipe=InpaintPipeline(vq_model, tokenizer=tokenizer,text_encoder=text_encoder,transformer=model,scheduler=scheduler) + pipe = pipe.to(device) + + with open(f"./assets/{args.mode}/cases.json", 'r', encoding='utf-8') as file: + cases = json.load(file) + item = cases[0] + + steps = 64 + CFG = 9 + resolution = 1024 + negative_prompts = item["negative_prompts"] if "negative_prompts" in item.keys() else "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark" + + image = Image.open(item["input"]).resize((resolution, resolution)).convert("RGB") + mask = Image.open(item["mask"]).resize((resolution, resolution)).convert("RGB") + + image = pipe(prompt=item["prompt"],negative_prompt=negative_prompts,image =image, mask_image =mask, guidance_scale=CFG, 
num_inference_steps=steps).images[0] + + output_dir = "./output" + os.makedirs(output_dir, exist_ok=True) + image.save(os.path.join(output_dir, f"{item['prompt'][:10]}_{resolution}_{steps}_{CFG}.png")) \ No newline at end of file diff --git a/Meissonic/output/1499_video_0_CFG-9.png b/Meissonic/output/1499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..6f2590b825e0d2ee626f55402a2e33289ffd21d4 Binary files /dev/null and b/Meissonic/output/1499_video_0_CFG-9.png differ diff --git a/Meissonic/output/1499_video_1_CFG-9.png b/Meissonic/output/1499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..8a33a1e9d524b64cf9075f90837a9df07f33e03a Binary files /dev/null and b/Meissonic/output/1499_video_1_CFG-9.png differ diff --git a/Meissonic/output/1999_video_0_CFG-9.png b/Meissonic/output/1999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..7575ff30e7761c1f0504647a427edd78f6808038 Binary files /dev/null and b/Meissonic/output/1999_video_0_CFG-9.png differ diff --git a/Meissonic/output/1999_video_1_CFG-9.png b/Meissonic/output/1999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..05b21ea00869ce35e4e9688e58ca3c52a4857f71 Binary files /dev/null and b/Meissonic/output/1999_video_1_CFG-9.png differ diff --git a/Meissonic/output/2499_video_0_CFG-9.png b/Meissonic/output/2499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..d53cac1c524a8fe1f6bd73c04b76aae0a8376105 Binary files /dev/null and b/Meissonic/output/2499_video_0_CFG-9.png differ diff --git a/Meissonic/output/2499_video_1_CFG-9.png b/Meissonic/output/2499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..0a6d0820f88af25095c25b02944cb1acb6926410 Binary files /dev/null and b/Meissonic/output/2499_video_1_CFG-9.png differ diff --git a/Meissonic/output/2999_video_0_CFG-9.png b/Meissonic/output/2999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..ad320a1185b9152c005072f847ef5dde3759d884 Binary files /dev/null and b/Meissonic/output/2999_video_0_CFG-9.png differ diff --git a/Meissonic/output/2999_video_1_CFG-9.png b/Meissonic/output/2999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..5ff2123f844e6841ed73c21910d705297a3ed2ca Binary files /dev/null and b/Meissonic/output/2999_video_1_CFG-9.png differ diff --git a/Meissonic/output/3499_video_0_CFG-9.png b/Meissonic/output/3499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..eec53728d51b9e1af9dba1cfa3cf74f3cc4197e4 Binary files /dev/null and b/Meissonic/output/3499_video_0_CFG-9.png differ diff --git a/Meissonic/output/3499_video_1_CFG-9.png b/Meissonic/output/3499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..0dbc43bd7c9cfad3ce9c483f144edbe9ee448155 Binary files /dev/null and b/Meissonic/output/3499_video_1_CFG-9.png differ diff --git a/Meissonic/output/3999_video_0_CFG-9.png b/Meissonic/output/3999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..645386bf728382f950dc3f967c934418d516c394 Binary files /dev/null and b/Meissonic/output/3999_video_0_CFG-9.png differ diff --git a/Meissonic/output/3999_video_1_CFG-9.png b/Meissonic/output/3999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..39290327427cb9e67d4475e15e70243271f4bf11 Binary files /dev/null and 
b/Meissonic/output/3999_video_1_CFG-9.png differ diff --git a/Meissonic/output/4499_video_0_CFG-9.png b/Meissonic/output/4499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..b9efbce8b6ac114730fd09efbec04e94df2fae60 Binary files /dev/null and b/Meissonic/output/4499_video_0_CFG-9.png differ diff --git a/Meissonic/output/4499_video_1_CFG-9.png b/Meissonic/output/4499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..519d909089bd32200a0cc3044ef9f6896f8e321b Binary files /dev/null and b/Meissonic/output/4499_video_1_CFG-9.png differ diff --git a/Meissonic/output/4999_video_0_CFG-9.png b/Meissonic/output/4999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..34bb47a00e7e93745834558d742deef14b142d0e Binary files /dev/null and b/Meissonic/output/4999_video_0_CFG-9.png differ diff --git a/Meissonic/output/4999_video_1_CFG-9.png b/Meissonic/output/4999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..bb40930d3427ac9709e0c67322f938b62782317b Binary files /dev/null and b/Meissonic/output/4999_video_1_CFG-9.png differ diff --git a/Meissonic/output/499_video_0_CFG-9.png b/Meissonic/output/499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..393d5d0268f80cb06c1f8993a72edeff1cfbcea5 Binary files /dev/null and b/Meissonic/output/499_video_0_CFG-9.png differ diff --git a/Meissonic/output/499_video_1_CFG-9.png b/Meissonic/output/499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..7ba68bf1b1dc65b3545513285d6160b6fe30b584 Binary files /dev/null and b/Meissonic/output/499_video_1_CFG-9.png differ diff --git a/Meissonic/output/5499_video_0_CFG-9.png b/Meissonic/output/5499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..e873d48f858c80c621bf5e9629e0bc6e7d5baa86 Binary files /dev/null and b/Meissonic/output/5499_video_0_CFG-9.png differ diff --git a/Meissonic/output/5499_video_1_CFG-9.png b/Meissonic/output/5499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..2d3df62c04e066bf9a2c07d0d62726d880cc58de Binary files /dev/null and b/Meissonic/output/5499_video_1_CFG-9.png differ diff --git a/Meissonic/output/999_video_0_CFG-9.png b/Meissonic/output/999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..43df7ce27f7a3baafa5e96960a7ed3aaf356822d Binary files /dev/null and b/Meissonic/output/999_video_0_CFG-9.png differ diff --git a/Meissonic/output/999_video_1_CFG-9.png b/Meissonic/output/999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..b3906600815d5fe97e4c179be3e57442c2adc07d Binary files /dev/null and b/Meissonic/output/999_video_1_CFG-9.png differ diff --git a/Meissonic/output/9_video_0_CFG-9.png b/Meissonic/output/9_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..a0ab40dd6b4066f867491c75ca554c324c4ba661 --- /dev/null +++ b/Meissonic/output/9_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebaa03e6ee352e35fd85ba78544394cb50fd39af6911bfa5b02b4474d61b297d +size 2667967 diff --git a/Meissonic/output/9_video_1_CFG-9.png b/Meissonic/output/9_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..4b134ed9606cf3f05a1fbc638f89bbc175fcb716 --- /dev/null +++ b/Meissonic/output/9_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c4cf7d4f5105f3b36f87791bf15994dee255f6f4cf9a80ccb2f4a724601062ff +size 2659087 diff --git a/Meissonic/output/A black an_1024_64_9_fp8.png b/Meissonic/output/A black an_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..015fe42c02fed3bf58feb93b1fdbdeb5dd527032 --- /dev/null +++ b/Meissonic/output/A black an_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c410907d8e44f4ccaece63fde693dc93e4476fa72ef7be724fb8ba108dd118 +size 1290747 diff --git a/Meissonic/output/A cat wear_1024_64_9_fp8.png b/Meissonic/output/A cat wear_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..9a61e907cd87ec9416b8272700c4eec32f0782ca --- /dev/null +++ b/Meissonic/output/A cat wear_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:112bdcb6d3e25688b64eee1d985ea05a1ab907e138bff1d959d2ec89700553c4 +size 1343976 diff --git a/Meissonic/output/A dog in a_1024_64_9_fp8.png b/Meissonic/output/A dog in a_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..618c3497e633f9047c6c20eabff34b403bc134de --- /dev/null +++ b/Meissonic/output/A dog in a_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f48f8a86e731244811146327db159b942a9ac8cb5e91c28fc836872397ef16 +size 1315619 diff --git a/Meissonic/output/A large bo_1024_64_9_fp8.png b/Meissonic/output/A large bo_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..8462c862580319248d4ca4210bc755636dd03d21 --- /dev/null +++ b/Meissonic/output/A large bo_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a468e912292188522cd6ed09387a75ce6b2ffdeda1b0f19c21eee52c14fbfe57 +size 1234086 diff --git a/Meissonic/output/A robot pl_1024_64_9_fp8.png b/Meissonic/output/A robot pl_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..2364499bb7ecff5235319e066dd2283c099abf98 --- /dev/null +++ b/Meissonic/output/A robot pl_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa93b23c58f4d7b5eecb53be2eaf224875068ba5e08b651aeceb5252e185d406 +size 1113804 diff --git a/Meissonic/output/A white an_1024_64_9_fp8.png b/Meissonic/output/A white an_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..b23643b9bad89736faabb6827609bb8f2e07cc74 --- /dev/null +++ b/Meissonic/output/A white an_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1759fa9f8fa18c82851513b3f30db9b081dc7fce46f45c4ae6c76ea1a97d45 +size 670465 diff --git a/Meissonic/output/The sun is_1024_64_9_fp8.png b/Meissonic/output/The sun is_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..7fca89a02c4e628b5705c5f49d6686ca966b50ea --- /dev/null +++ b/Meissonic/output/The sun is_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6489a2450e3f16c712be990336a5831670fa08b2f4c90d5651373337970490b +size 1146796 diff --git a/Meissonic/output/Three boat_1024_64_9_fp8.png b/Meissonic/output/Three boat_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..0cf8121390e436fb2c8eb64dc745c2487b3d8f0f --- /dev/null +++ b/Meissonic/output/Three boat_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a1bc95ba5fbcd67edafd9f7bd8b3fb8cf044fb67811505dce7bb274e39b24ce8 +size 1088212 diff --git a/Meissonic/output/Two actors_1024_64_9_fp8.png b/Meissonic/output/Two actors_1024_64_9_fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..9cd2808bd5e3c87ecbdbff1249a74ae035dad0a9 --- /dev/null +++ b/Meissonic/output/Two actors_1024_64_9_fp8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2d94a2e5f37e7152b577661d43e273518a94358462de13c5d4c8e3969cdea72 +size 1407954 diff --git a/Meissonic/output/checkpoint-10/random_states_0.pkl b/Meissonic/output/checkpoint-10/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..23abb7edd7ef53310502d505d92cc040a7bfe3ae --- /dev/null +++ b/Meissonic/output/checkpoint-10/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d5bc8413a4e5761d79c7fec6e009d5e2196859120d768c880a42c751163ec92 +size 16513 diff --git a/Meissonic/output/checkpoint-10/random_states_1.pkl b/Meissonic/output/checkpoint-10/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a1a7990b617190645b040c462ac850946bd10d2b --- /dev/null +++ b/Meissonic/output/checkpoint-10/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f6d362005c685999dff7a6bc3081e5ee4c6601492eb2ee5d3bfee06b162d54a +size 16513 diff --git a/Meissonic/output/checkpoint-10/random_states_2.pkl b/Meissonic/output/checkpoint-10/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b48408a16b732106b23b24042f12435e638d9538 --- /dev/null +++ b/Meissonic/output/checkpoint-10/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a29c4250108f84d1d20f3f4f4f540baaff5d10abbf75d797033e3be4ae8f77 +size 16513 diff --git a/Meissonic/output/checkpoint-10/random_states_3.pkl b/Meissonic/output/checkpoint-10/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a0970af2f7c4942b4e70d8d1cba5c9f28c9bff50 --- /dev/null +++ b/Meissonic/output/checkpoint-10/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc7bdca0af51cf347504c5f02be8a4300c7f09e3438b5e80ce417d15fb1244ae +size 16513 diff --git a/Meissonic/output/checkpoint-10/random_states_4.pkl b/Meissonic/output/checkpoint-10/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..20e471d70ad03d2459fae30ad7208e2e4264e55a --- /dev/null +++ b/Meissonic/output/checkpoint-10/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79658f792ad8abf1bc21dd84d06f113241173414722b07b5dce3d4ad65f5ecab +size 16513 diff --git a/Meissonic/output/checkpoint-10/random_states_5.pkl b/Meissonic/output/checkpoint-10/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..619c080afe93028d4e49dea812153a1bb3b2e64a --- /dev/null +++ b/Meissonic/output/checkpoint-10/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e9d523ff9e90a8d0bef5b2348997359a74fc383ef7de31e9b58f67d5b9823d6 +size 16513 diff --git a/Meissonic/output/checkpoint-10/random_states_6.pkl b/Meissonic/output/checkpoint-10/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..08e96e45343a88bdc74fed906805e83084428fc6 --- /dev/null +++ b/Meissonic/output/checkpoint-10/random_states_6.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:e1b65098af948d709c887b11cde68714ba9bdad1cfbaacd5e8186f6f76ed02b8 +size 16513 diff --git a/Meissonic/output/checkpoint-10/random_states_7.pkl b/Meissonic/output/checkpoint-10/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e7d057a04318063e960fb101b41475fee61b1d76 --- /dev/null +++ b/Meissonic/output/checkpoint-10/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7276d56f96bca5ac63742a0c85b0fce5aff578d15c4ca7dd676d889ced70ba95 +size 16513 diff --git a/Meissonic/output/checkpoint-10/transformer/config.json b/Meissonic/output/checkpoint-10/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-10/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-100/random_states_0.pkl b/Meissonic/output/checkpoint-100/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0078d53a70b6b80944bc7028b5ba13f8b12dd5dd --- /dev/null +++ b/Meissonic/output/checkpoint-100/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0084fa44d2bc930cc308270ece01f1927022ba91792046892800217322580ec +size 16513 diff --git a/Meissonic/output/checkpoint-100/random_states_1.pkl b/Meissonic/output/checkpoint-100/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..037f1ee0c7209f9111882b681c929e2170667ef5 --- /dev/null +++ b/Meissonic/output/checkpoint-100/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d833a0b164368eab757faffcddbc7420dcc5d1ff73b7e3597d115e1b763c9be0 +size 16513 diff --git a/Meissonic/output/checkpoint-100/random_states_2.pkl b/Meissonic/output/checkpoint-100/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7fa5b159b28298ef9977dbe022562510e6cd8de9 --- /dev/null +++ b/Meissonic/output/checkpoint-100/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fc0b21beeaf4eaae6ee43d8bd3d8307be9454493bdcb7022d0c0fe025551079 +size 16513 diff --git a/Meissonic/output/checkpoint-100/random_states_3.pkl b/Meissonic/output/checkpoint-100/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0ecd1e265a7b88a512e3894411762545640c6fb4 --- /dev/null +++ b/Meissonic/output/checkpoint-100/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e9e2a69e82d5791fe6b12d6fa8ad0a87965e8a1a75edf63316fda93a9eeebdb +size 16513 diff --git a/Meissonic/output/checkpoint-100/random_states_4.pkl b/Meissonic/output/checkpoint-100/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0e01d1c34e195de9ac7b269c2996dda1055fa467 --- /dev/null +++ b/Meissonic/output/checkpoint-100/random_states_4.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:4865dc4376525dc84c5e6dbeef63dac8a06fc0990ab20f6ae8bdd419beda4ec5 +size 16513 diff --git a/Meissonic/output/checkpoint-100/random_states_5.pkl b/Meissonic/output/checkpoint-100/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5e9b1fa704b3bc98cec48b97800d01957b6c163f --- /dev/null +++ b/Meissonic/output/checkpoint-100/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da4bb574de4d5cfdf1ba7cdd6b82346fff88fd1aaa1dbfbda77c25ee06400f9 +size 16513 diff --git a/Meissonic/output/checkpoint-100/random_states_6.pkl b/Meissonic/output/checkpoint-100/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8ffa353b8a43a0dc95c76fac36c61dbc0486726f --- /dev/null +++ b/Meissonic/output/checkpoint-100/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f826cf3fd5ae03cada8fd15dbc3c41967ee2e13595d761b87e9233413d64e23 +size 16513 diff --git a/Meissonic/output/checkpoint-100/random_states_7.pkl b/Meissonic/output/checkpoint-100/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c864881a7ade9dd633267000e900aa44de0d099c --- /dev/null +++ b/Meissonic/output/checkpoint-100/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e98ee5f4a8e5524554f6de50879dfd95e2be6254370440c873735a31623aa4d +size 16513 diff --git a/Meissonic/output/checkpoint-100/transformer/config.json b/Meissonic/output/checkpoint-100/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-100/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-1000/random_states_0.pkl b/Meissonic/output/checkpoint-1000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a29bddda8cf6619869e3eee0d505c5cbb4d76719 --- /dev/null +++ b/Meissonic/output/checkpoint-1000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f940c3fb669b7a7803922fe8f9e65a821d507f91c523fb3bae21554e4c781c +size 16513 diff --git a/Meissonic/output/checkpoint-1000/random_states_1.pkl b/Meissonic/output/checkpoint-1000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..008aa4813f4e111b7edf7f6aa9cfb47599e23b0b --- /dev/null +++ b/Meissonic/output/checkpoint-1000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05d36549aa0244f71bc213ea273a8ff58bf9290340edbd72a896bb244b4d7b9 +size 16513 diff --git a/Meissonic/output/checkpoint-1000/random_states_2.pkl b/Meissonic/output/checkpoint-1000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cb5435bfb44064b014cb8a29107d1976d18ad0a2 --- /dev/null +++ b/Meissonic/output/checkpoint-1000/random_states_2.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:6e1ed813e4af659759269d6af961c435b8e5ec6516e727cd17aa7891c4cc3fcc +size 16513 diff --git a/Meissonic/output/checkpoint-1000/random_states_3.pkl b/Meissonic/output/checkpoint-1000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..977a37d65994fce42f0b8f080dae11bd2caad7d5 --- /dev/null +++ b/Meissonic/output/checkpoint-1000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98b7507d461060c3d08bd5a90294fdb3e857d8421ef16a93ff9e4af719361ec1 +size 16513 diff --git a/Meissonic/output/checkpoint-1000/random_states_4.pkl b/Meissonic/output/checkpoint-1000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..50a2538ce83d89b2618191c3b6a38343758b5864 --- /dev/null +++ b/Meissonic/output/checkpoint-1000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32dade0fefe749929f8d55628b0ef29261d92aee47401daa5406bf619c67182b +size 16513 diff --git a/Meissonic/output/checkpoint-1000/random_states_5.pkl b/Meissonic/output/checkpoint-1000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..71a370699890c277b15d991fcef32b1b26ec50e8 --- /dev/null +++ b/Meissonic/output/checkpoint-1000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64cb9b0b33272e3926bee211b4b5c9ec520cf1e66fc2efdf50d97d4e6a357ad +size 16513 diff --git a/Meissonic/output/checkpoint-1000/random_states_6.pkl b/Meissonic/output/checkpoint-1000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7de5dccc455be6b29d36b71b582fb1979a10606f --- /dev/null +++ b/Meissonic/output/checkpoint-1000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f2f478b2e490ce63bc0d219fa26fb280836b31e9f4c27886b82168ae93cd4d +size 16513 diff --git a/Meissonic/output/checkpoint-1000/random_states_7.pkl b/Meissonic/output/checkpoint-1000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d45a16916280baa4507dcf7ba618406ec5cd737d --- /dev/null +++ b/Meissonic/output/checkpoint-1000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc220d1af521d5840de5e2eb2ba082a5bcb03a23ec36c33897e1f4dcfba7d65 +size 16513 diff --git a/Meissonic/output/checkpoint-1000/transformer/config.json b/Meissonic/output/checkpoint-1000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-1000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-1500/random_states_0.pkl b/Meissonic/output/checkpoint-1500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5ad8731cf2d578e56993359b48ed4101ddedfb62 --- /dev/null +++ b/Meissonic/output/checkpoint-1500/random_states_0.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ef32bc4aa5abd4ca7b7aa3f23480cf13d429965204bf529035ef664a3cb1d964 +size 16513 diff --git a/Meissonic/output/checkpoint-1500/random_states_1.pkl b/Meissonic/output/checkpoint-1500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c1665ab80cd45fda40739283ef625b7b83cd7081 --- /dev/null +++ b/Meissonic/output/checkpoint-1500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3bc4614a2c1bf96932f43e4270dcdbf071df5c95897ab346953912a340f9e2c +size 16513 diff --git a/Meissonic/output/checkpoint-1500/random_states_2.pkl b/Meissonic/output/checkpoint-1500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..845dc22e8eba7e4d19dc4fbeff8af933a4653d74 --- /dev/null +++ b/Meissonic/output/checkpoint-1500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dba9cb34c81d6b492b1f3860fc760d155dbfacaf3c7b48ebf3c1dd195b75ead +size 16513 diff --git a/Meissonic/output/checkpoint-1500/random_states_3.pkl b/Meissonic/output/checkpoint-1500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3a0141735c9b759be93c05621f023f52bbc69f16 --- /dev/null +++ b/Meissonic/output/checkpoint-1500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61ea3521cff71899fc73256d5f33db45614ced9379d180a2501e5da51641e954 +size 16513 diff --git a/Meissonic/output/checkpoint-1500/random_states_4.pkl b/Meissonic/output/checkpoint-1500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2fdb22ffc5c923c162430966aea9c52d5fce06fb --- /dev/null +++ b/Meissonic/output/checkpoint-1500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f830aed41c79109752ce6d533385b364e4958a8a96c08f16c074c8be3d54f399 +size 16513 diff --git a/Meissonic/output/checkpoint-1500/random_states_5.pkl b/Meissonic/output/checkpoint-1500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1278d81a587a64023ef64464ca6f2e7b2d5c5c93 --- /dev/null +++ b/Meissonic/output/checkpoint-1500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ea6b990b37e6dc50c423bba41e4f6fcd2f1f4d02b0374a48287095aa7d9e462 +size 16513 diff --git a/Meissonic/output/checkpoint-1500/random_states_6.pkl b/Meissonic/output/checkpoint-1500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8debcd89164f72783bb5c31783e138dc6e184e36 --- /dev/null +++ b/Meissonic/output/checkpoint-1500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5d0a596aff14cdbe7a00bd1e780c89f879127f6e1fe92974fbbce3126a98225 +size 16513 diff --git a/Meissonic/output/checkpoint-1500/random_states_7.pkl b/Meissonic/output/checkpoint-1500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..885e5c9acb62be6af05a0c583f6b5b1a64c68b09 --- /dev/null +++ b/Meissonic/output/checkpoint-1500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f429726f02b95d19f4d38d9e3989aa099c6503835745373559f96c38dcf407fb +size 16513 diff --git a/Meissonic/output/checkpoint-1500/transformer/config.json b/Meissonic/output/checkpoint-1500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ 
b/Meissonic/output/checkpoint-1500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-20/random_states_0.pkl b/Meissonic/output/checkpoint-20/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9114281804e289ef21e3c3b2880538669a43eff8 --- /dev/null +++ b/Meissonic/output/checkpoint-20/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a7fdfb83b875ae8d767bad5b9096ac7669a46ac17895d9a631d71e1367a509e +size 16513 diff --git a/Meissonic/output/checkpoint-20/random_states_1.pkl b/Meissonic/output/checkpoint-20/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0fd783313da7dc9e817608e68305cfa9fc6ef5c0 --- /dev/null +++ b/Meissonic/output/checkpoint-20/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826dfb69f6db08d6d86f4bcfcd588fe0afb39a8ff0e4e279b13f035bb2c4b3ec +size 16513 diff --git a/Meissonic/output/checkpoint-20/random_states_2.pkl b/Meissonic/output/checkpoint-20/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5cfcd14b0532f8f1750a4ea086290b4957a37ff8 --- /dev/null +++ b/Meissonic/output/checkpoint-20/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aff679ec3180f7e5b467e5e5e7b86d203a928be8031db786a435d647f7e34e3 +size 16513 diff --git a/Meissonic/output/checkpoint-20/random_states_3.pkl b/Meissonic/output/checkpoint-20/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0d2009310e87433c69ae08ef4f48fa5562d8e6e6 --- /dev/null +++ b/Meissonic/output/checkpoint-20/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36317b6083000db5b8634346242a4f98de06a21563e2a2a30f63d86fa6ba6bd1 +size 16513 diff --git a/Meissonic/output/checkpoint-20/random_states_4.pkl b/Meissonic/output/checkpoint-20/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..898a2441c41a26e8210bcd985619ed17dcf95879 --- /dev/null +++ b/Meissonic/output/checkpoint-20/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dc6cf9695894a1c885832e0875289c610ee4c0dc18bea99d7137295f5f18ebc +size 16513 diff --git a/Meissonic/output/checkpoint-20/random_states_5.pkl b/Meissonic/output/checkpoint-20/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..381ecf7188505b852b8f0630f7e8c23678d22efb --- /dev/null +++ b/Meissonic/output/checkpoint-20/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a25c710e3469b8a3dee5b23e99e647c184d6d8d65657bde8a20e61652091f16a +size 16513 diff --git a/Meissonic/output/checkpoint-20/random_states_6.pkl b/Meissonic/output/checkpoint-20/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7642c179edc6baaf6d3f78331ceafc16bdd915d9 --- /dev/null +++ 
b/Meissonic/output/checkpoint-20/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c42072a72b219898ad82f2a8fe21210f112ce6e56c0107c96b7db9987fe990af +size 16513 diff --git a/Meissonic/output/checkpoint-20/random_states_7.pkl b/Meissonic/output/checkpoint-20/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..db1e34882ea300f78d82a6290694a92f0d0daf11 --- /dev/null +++ b/Meissonic/output/checkpoint-20/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626b654d9dd38dffc07cde140bd9a2c62771136296e47969d01b2cee8b3b08c4 +size 16513 diff --git a/Meissonic/output/checkpoint-20/transformer/config.json b/Meissonic/output/checkpoint-20/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-20/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-2000/random_states_0.pkl b/Meissonic/output/checkpoint-2000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ad1e74ca038e413f81df8a1a8ce37fdf93d30b42 --- /dev/null +++ b/Meissonic/output/checkpoint-2000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:653d19740de2cb731de5806eab448633d3ce5d3f1fb7357efd2b233e484d978d +size 16513 diff --git a/Meissonic/output/checkpoint-2000/random_states_1.pkl b/Meissonic/output/checkpoint-2000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7f19e4d7c8651d6acc82fbca48218bff020a7390 --- /dev/null +++ b/Meissonic/output/checkpoint-2000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5db59dc566cf58864b5f5f6f3e2e9c99b0a1febc3a7c77bee7abc9a9fdda303e +size 16513 diff --git a/Meissonic/output/checkpoint-2000/random_states_2.pkl b/Meissonic/output/checkpoint-2000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c294222e8581a47cc44932b18cc7083b910d80b0 --- /dev/null +++ b/Meissonic/output/checkpoint-2000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54fe39c07e2b2825dd0ebdf71986c2a169f6e946259e1c3b86f36c5e01c1890a +size 16513 diff --git a/Meissonic/output/checkpoint-2000/random_states_3.pkl b/Meissonic/output/checkpoint-2000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..64d956a9ffac1c385bc4291f82e59994e17fdd1c --- /dev/null +++ b/Meissonic/output/checkpoint-2000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f43f2286b6aae0ab98dbfda96f86e95bf852341121d0a1d3fc7fa66971f69dd +size 16513 diff --git a/Meissonic/output/checkpoint-2000/random_states_4.pkl b/Meissonic/output/checkpoint-2000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c5e13d44c650dec196cc8445b0c36ddb1a61169b --- /dev/null +++ 
b/Meissonic/output/checkpoint-2000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2039d27f209436f9d45f122a5f1bb82dd0b1a6d9b88c35e6aeab5d616c71f425 +size 16513 diff --git a/Meissonic/output/checkpoint-2000/random_states_5.pkl b/Meissonic/output/checkpoint-2000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e585593ee8c035f15162e164a476b6e1b3c21b4e --- /dev/null +++ b/Meissonic/output/checkpoint-2000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc781203de4a75bebdf9ee2a2c6aebb32c0bac96118b087ed84d65e441fd6c9d +size 16513 diff --git a/Meissonic/output/checkpoint-2000/random_states_6.pkl b/Meissonic/output/checkpoint-2000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6c66e3393439ab599e7fc976cba2a9849d9c9781 --- /dev/null +++ b/Meissonic/output/checkpoint-2000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20cf3d423244b010eba200983b80aa498c72ca8deff1972aed851a1ece62c1b1 +size 16513 diff --git a/Meissonic/output/checkpoint-2000/random_states_7.pkl b/Meissonic/output/checkpoint-2000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4e940b50f7211941cef1f1884acd91b9a32612a0 --- /dev/null +++ b/Meissonic/output/checkpoint-2000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807446677b1c21f8a26a796bf0653858e8438c14ff0bcd2ce2cc2e15ee967f05 +size 16513 diff --git a/Meissonic/output/checkpoint-2000/transformer/config.json b/Meissonic/output/checkpoint-2000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-2000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-2500/random_states_0.pkl b/Meissonic/output/checkpoint-2500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..253cec26079e0f881ec2c4d71dd772866f59b2b6 --- /dev/null +++ b/Meissonic/output/checkpoint-2500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b351267f5bda75cf052de873f85bd242b21f2e9b9576720c3ea5758adc69668 +size 16513 diff --git a/Meissonic/output/checkpoint-2500/random_states_1.pkl b/Meissonic/output/checkpoint-2500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5110f70b1804e713efc0d9db3e740924ebef54d9 --- /dev/null +++ b/Meissonic/output/checkpoint-2500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d7350ec7cda501f8ca6c136fd7c62bd09859daccac415abe5353aa2f7ca6e0c +size 16513 diff --git a/Meissonic/output/checkpoint-2500/random_states_2.pkl b/Meissonic/output/checkpoint-2500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..434b5d06864948be785be9f14398df506caeb8b3 --- /dev/null +++ 
b/Meissonic/output/checkpoint-2500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46127ece3af71a2ca93e9758268d4f7c698700943fa4c9d732e506cf8e897401 +size 16513 diff --git a/Meissonic/output/checkpoint-2500/random_states_3.pkl b/Meissonic/output/checkpoint-2500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d05378651632f0e0f141074f5946f1d643e91caf --- /dev/null +++ b/Meissonic/output/checkpoint-2500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46020660e6e32fda56e75812406f28e347c7b14569a8b14e3f3756c738e35517 +size 16513 diff --git a/Meissonic/output/checkpoint-2500/random_states_4.pkl b/Meissonic/output/checkpoint-2500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..611f4830cacaf50d998c9c81821415afb55b28c5 --- /dev/null +++ b/Meissonic/output/checkpoint-2500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:717e18ea4713b6e25e782b4ccc11a4daba2b23741753346f7dcc1bdb0adedf11 +size 16513 diff --git a/Meissonic/output/checkpoint-2500/random_states_5.pkl b/Meissonic/output/checkpoint-2500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3c754985ae6a66ef4ffc9258d21b9ec06e06983d --- /dev/null +++ b/Meissonic/output/checkpoint-2500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae543d3ba1abe0bee6b0feed8e7ded081c5938394d17ec4253201019386d0d57 +size 16513 diff --git a/Meissonic/output/checkpoint-2500/random_states_6.pkl b/Meissonic/output/checkpoint-2500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..df043ad2b0cefb3deb3fc90bc9268bf5847b294c --- /dev/null +++ b/Meissonic/output/checkpoint-2500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b21e7959305c94c019feca4d8d57f51de0dcce7fff68709f6b175ac66ef359fe +size 16513 diff --git a/Meissonic/output/checkpoint-2500/random_states_7.pkl b/Meissonic/output/checkpoint-2500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..481c28bb67390892fb73056c0f8c84415e76f9a3 --- /dev/null +++ b/Meissonic/output/checkpoint-2500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ea116d20122f55ce31fd8bfd0ea5cbed9999e07ba8c7bdc94c61150ca4bde5 +size 16513 diff --git a/Meissonic/output/checkpoint-2500/transformer/config.json b/Meissonic/output/checkpoint-2500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-2500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-30/random_states_0.pkl b/Meissonic/output/checkpoint-30/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bc1a5f0a7127126017bf978e00741857441de670 --- /dev/null +++ 
b/Meissonic/output/checkpoint-30/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1701c367b5cfd0206b8b1aac2bf75b5caf0dcec5dacdd16d36cac25591ba3810 +size 16513 diff --git a/Meissonic/output/checkpoint-30/random_states_1.pkl b/Meissonic/output/checkpoint-30/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1555230e966f2bb1933c3f79e6915a97ecdec805 --- /dev/null +++ b/Meissonic/output/checkpoint-30/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4d937026efb2c704cc523b8117dcb87fafa6f7330ba48c966d3e06f55745a23 +size 16513 diff --git a/Meissonic/output/checkpoint-30/random_states_2.pkl b/Meissonic/output/checkpoint-30/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2e9f869f989a9c88318cb4777b4b56ac5722ee02 --- /dev/null +++ b/Meissonic/output/checkpoint-30/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d96b778add84631abcd4218c3f8fff8dbc6b502b94266e3117c58589c3e3b70 +size 16513 diff --git a/Meissonic/output/checkpoint-30/random_states_3.pkl b/Meissonic/output/checkpoint-30/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b258e34ea4bd0f149a93eef76fbf579cef34c84e --- /dev/null +++ b/Meissonic/output/checkpoint-30/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e0b10726e84e2a6aca047ee71a8448222ffee3fe0926b5e75cf1f0530959e77 +size 16513 diff --git a/Meissonic/output/checkpoint-30/random_states_4.pkl b/Meissonic/output/checkpoint-30/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f28a7abc182ba35b0ed0bd6107e94bddebe6401f --- /dev/null +++ b/Meissonic/output/checkpoint-30/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97c64eafc59750c156ea3d9e778e5bbecc680fa09599301e205ec34fecdbe993 +size 16513 diff --git a/Meissonic/output/checkpoint-30/random_states_5.pkl b/Meissonic/output/checkpoint-30/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..51b169d20235bf6f80f648ebc43d6132749f09d5 --- /dev/null +++ b/Meissonic/output/checkpoint-30/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80bfd2b2c7adc1e5dd152ab21d2c00a64c386978271d8a4b42dfda49ce2b9e8d +size 16513 diff --git a/Meissonic/output/checkpoint-30/random_states_6.pkl b/Meissonic/output/checkpoint-30/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6be2975801fee922eb0de555cc2b9c7c9101b7a0 --- /dev/null +++ b/Meissonic/output/checkpoint-30/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b7f8bd8fff566f3f28bb093af00af31321f449985cd5eaf97c578ee3da0eeac +size 16513 diff --git a/Meissonic/output/checkpoint-30/random_states_7.pkl b/Meissonic/output/checkpoint-30/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7f99f982eb2a0db2e6056226f7e0eaf47229d22d --- /dev/null +++ b/Meissonic/output/checkpoint-30/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5a1696202639d34b4513f9c317187e80dcbeaef1e2014cd79c141296bcac310 +size 16513 diff --git a/Meissonic/output/checkpoint-30/transformer/config.json b/Meissonic/output/checkpoint-30/transformer/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-30/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-3000/random_states_0.pkl b/Meissonic/output/checkpoint-3000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bab88682bc368c2fdcdb92eef1a8faa2fbe3c1f7 --- /dev/null +++ b/Meissonic/output/checkpoint-3000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c90209db4b0a636ab74caf90aaed023a81b0f84dfd50205b74fb8f044a66927 +size 16513 diff --git a/Meissonic/output/checkpoint-3000/random_states_1.pkl b/Meissonic/output/checkpoint-3000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8cd5e9c9f2f3d0828245b9ac433871ebd36e96d3 --- /dev/null +++ b/Meissonic/output/checkpoint-3000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0214bd6a919f5f0bedf2403c1ab4a567e1dcebf700d8a5c7e477429897e4dd3c +size 16513 diff --git a/Meissonic/output/checkpoint-3000/random_states_2.pkl b/Meissonic/output/checkpoint-3000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..883fe57bcd94c4c5a855e6cd73138c055a1360ca --- /dev/null +++ b/Meissonic/output/checkpoint-3000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:501d41a8533aeaf687e198b6f59711890e507f6ca71ee5daef9876d538c71b21 +size 16513 diff --git a/Meissonic/output/checkpoint-3000/random_states_3.pkl b/Meissonic/output/checkpoint-3000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d8de92767cf909b2cefbf8552c2516648fbcdbc9 --- /dev/null +++ b/Meissonic/output/checkpoint-3000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f0508014726289dc6bc087b956d7837ef74e826f84963c8b58ea5d607689af1 +size 16513 diff --git a/Meissonic/output/checkpoint-3000/random_states_4.pkl b/Meissonic/output/checkpoint-3000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c6009a49171626afd7652f3cb205d6b7dbc5ac3c --- /dev/null +++ b/Meissonic/output/checkpoint-3000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3eac2f72e09172efd7bd80aa653438bec91d91eccf70b810f9718b81620ac1c +size 16513 diff --git a/Meissonic/output/checkpoint-3000/random_states_5.pkl b/Meissonic/output/checkpoint-3000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d7c289fc5db8d3089b45e79d975ab7eb83688a4e --- /dev/null +++ b/Meissonic/output/checkpoint-3000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe06de9fe1105cef39bb2a25675ec280de7696d280de116a9c35cbef5b9a624 +size 16513 diff --git a/Meissonic/output/checkpoint-3000/random_states_6.pkl b/Meissonic/output/checkpoint-3000/random_states_6.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..62c0f92cd17a4722f28ad7d462a09942f7243a07 --- /dev/null +++ b/Meissonic/output/checkpoint-3000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e52bcbe44171fe2f86b9b9ce5927cb162959ef37d78da7264c4c46e825ed171 +size 16513 diff --git a/Meissonic/output/checkpoint-3000/random_states_7.pkl b/Meissonic/output/checkpoint-3000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8775b5da65a8917902eec52d7e0870175525ffe4 --- /dev/null +++ b/Meissonic/output/checkpoint-3000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6477f22e11796d6cfdfd057ecf670b5b225890005f68dc03dedf0edbaf888538 +size 16513 diff --git a/Meissonic/output/checkpoint-3000/transformer/config.json b/Meissonic/output/checkpoint-3000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-3000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-3500/random_states_0.pkl b/Meissonic/output/checkpoint-3500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a6c145f472e325bb7ff5f161e146ba72f89daa7b --- /dev/null +++ b/Meissonic/output/checkpoint-3500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5289d6142f81c1e1341195ac130f6618f985116929ae5a760845fb4e55d1aa +size 16513 diff --git a/Meissonic/output/checkpoint-3500/random_states_1.pkl b/Meissonic/output/checkpoint-3500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1a35ed2e93742d10cb95aee6d80b1e0b4de95c68 --- /dev/null +++ b/Meissonic/output/checkpoint-3500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c59f99b2559fc96513b2872d96e708647ff8526c9b9cb0ff8b68720388f8abeb +size 16513 diff --git a/Meissonic/output/checkpoint-3500/random_states_2.pkl b/Meissonic/output/checkpoint-3500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..adc7881efb557f6c6edd0acd3067eebeecd5641a --- /dev/null +++ b/Meissonic/output/checkpoint-3500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7acf9bdf8e249747bc4a3881328a6d8a22d0b3654d92622a886dcf95e83720 +size 16513 diff --git a/Meissonic/output/checkpoint-3500/random_states_3.pkl b/Meissonic/output/checkpoint-3500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..810c1913067cd6c31d7b16e1e4b6263d8443ec40 --- /dev/null +++ b/Meissonic/output/checkpoint-3500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e423bbd9bf4fce7de9d063d4ff3e4e7f3dd99ec75409d746d5eab370090b53d9 +size 16513 diff --git a/Meissonic/output/checkpoint-3500/random_states_4.pkl b/Meissonic/output/checkpoint-3500/random_states_4.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..6e0b4652d665f5fd7a5930abc405a8e2989fd4e6 --- /dev/null +++ b/Meissonic/output/checkpoint-3500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a95515d8758fa0312da6424dfc24c80748357c31fb7d1c3e110ecb27512d8e0 +size 16513 diff --git a/Meissonic/output/checkpoint-3500/random_states_5.pkl b/Meissonic/output/checkpoint-3500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..51613c375c1916f8e629a1d451bd7969ae114ab7 --- /dev/null +++ b/Meissonic/output/checkpoint-3500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7258ebfe0fdcd0497eeb12e0684d0fafc4b48346d3d0e54aa6e4438341735c2 +size 16513 diff --git a/Meissonic/output/checkpoint-3500/random_states_6.pkl b/Meissonic/output/checkpoint-3500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a51046adb7e961234878121e144d1707df7986ae --- /dev/null +++ b/Meissonic/output/checkpoint-3500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd1613e018393a96c4e83f2a8ae84763c1a22f69292e9e7efdab0535376e1fc +size 16513 diff --git a/Meissonic/output/checkpoint-3500/random_states_7.pkl b/Meissonic/output/checkpoint-3500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2bb98b1c8711d4b6b199c2b990aac53baa5f1b72 --- /dev/null +++ b/Meissonic/output/checkpoint-3500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8880d1bb7fb76a462a7607b758bc69cc2a26a0c03d3a54e24db883cf31fdad1 +size 16513 diff --git a/Meissonic/output/checkpoint-3500/transformer/config.json b/Meissonic/output/checkpoint-3500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-3500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-40/random_states_0.pkl b/Meissonic/output/checkpoint-40/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ccc2ce0649cd3cca5d0c8a2c9c74c2bd2be2f1c5 --- /dev/null +++ b/Meissonic/output/checkpoint-40/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51c915e0952631f5fba7290b5ba6db588753febe4af9347cc65cb95db0fe6dd +size 16513 diff --git a/Meissonic/output/checkpoint-40/random_states_1.pkl b/Meissonic/output/checkpoint-40/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1fd019cec257f95aa8dc5762b6b6ab041029d76a --- /dev/null +++ b/Meissonic/output/checkpoint-40/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4421348b769c39ad6edec74235b5ccc3e581256af393d38284528b58061522c0 +size 16513 diff --git a/Meissonic/output/checkpoint-40/random_states_2.pkl b/Meissonic/output/checkpoint-40/random_states_2.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..b19e5aba7af597e02a764fb97d0d97c9c66e20a3 --- /dev/null +++ b/Meissonic/output/checkpoint-40/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581f6c16da7a50a002794fbd4518470b899ca0483036afd8d4da6d9bcb8e280b +size 16513 diff --git a/Meissonic/output/checkpoint-40/random_states_3.pkl b/Meissonic/output/checkpoint-40/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7e6f7dae179abfb5b656c9611bfce395654f6bef --- /dev/null +++ b/Meissonic/output/checkpoint-40/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a491c38a1f6f483a909126d30877d903d8d344bff226010d7e2a82e5e7e74bd +size 16513 diff --git a/Meissonic/output/checkpoint-40/random_states_4.pkl b/Meissonic/output/checkpoint-40/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8795d8b1ed5aa1147a741b338db6b520aa4bb0d7 --- /dev/null +++ b/Meissonic/output/checkpoint-40/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f34bc2efb358a9daf2f69c07a546cc14a435fa4a0f039837e57afa67c90535f +size 16513 diff --git a/Meissonic/output/checkpoint-40/random_states_5.pkl b/Meissonic/output/checkpoint-40/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..873a448c3439e982d3d36ab5c077dbea57638ee7 --- /dev/null +++ b/Meissonic/output/checkpoint-40/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea058dbcb294854130651a8bf2b54fabd891163a549e25718f3d3cfa426b7ceb +size 16513 diff --git a/Meissonic/output/checkpoint-40/random_states_6.pkl b/Meissonic/output/checkpoint-40/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9665cc16d719ea7235d59897526ddbf1b28e44cf --- /dev/null +++ b/Meissonic/output/checkpoint-40/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39518bcc5444700737cab403a6359fdac6cfb58f524ee972c4f95875b346d05b +size 16513 diff --git a/Meissonic/output/checkpoint-40/random_states_7.pkl b/Meissonic/output/checkpoint-40/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e854e390d14cac159792176408402ce9422e7255 --- /dev/null +++ b/Meissonic/output/checkpoint-40/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4421b38edcb780799d55cfa803bd728a43c38822409c2a5cc01a107c55006f8f +size 16513 diff --git a/Meissonic/output/checkpoint-40/transformer/config.json b/Meissonic/output/checkpoint-40/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-40/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-4000/random_states_0.pkl b/Meissonic/output/checkpoint-4000/random_states_0.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..a83668e423093d743b44407581e40479c0116fa9 --- /dev/null +++ b/Meissonic/output/checkpoint-4000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ddc82bbaf1b0ba91cd038648579a58c536a8aec87dd456504294eb5c8acabe9 +size 16513 diff --git a/Meissonic/output/checkpoint-4000/random_states_1.pkl b/Meissonic/output/checkpoint-4000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e13619d73ddd3cb2e180ff14fb10db07b8f915d7 --- /dev/null +++ b/Meissonic/output/checkpoint-4000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7452cb57f95d7bd3d3222abe7732a8609f5b29bfc240c22a2194cab398097ff +size 16513 diff --git a/Meissonic/output/checkpoint-4000/random_states_2.pkl b/Meissonic/output/checkpoint-4000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..259d4da00505bd58d8a2298935e5e897f3220680 --- /dev/null +++ b/Meissonic/output/checkpoint-4000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3df80c8a8ffd629bd2e3ee9052784ecec7b92a796e6cf65377114a0e88cb081 +size 16513 diff --git a/Meissonic/output/checkpoint-4000/random_states_3.pkl b/Meissonic/output/checkpoint-4000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6a3af419e2870d00c41bf42f2aaf302c634e82a4 --- /dev/null +++ b/Meissonic/output/checkpoint-4000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fa323bd63f084411d3b0708eb54de1ab0733abf2f8e8b0c96b9061e07f7e650 +size 16513 diff --git a/Meissonic/output/checkpoint-4000/random_states_4.pkl b/Meissonic/output/checkpoint-4000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e8bec12efb7efe6d8853674006b97a3153e58c94 --- /dev/null +++ b/Meissonic/output/checkpoint-4000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06b6fd0a186f49932eb6f595fb2067be69f329d54e64a9fa95620604a2346763 +size 16513 diff --git a/Meissonic/output/checkpoint-4000/random_states_5.pkl b/Meissonic/output/checkpoint-4000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d32c9af2c8ccc5f154b70ac02beb37157cdef159 --- /dev/null +++ b/Meissonic/output/checkpoint-4000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cee156de615c7ec34db9752cbf35f599aea29b4e87ca90dc3a797c36ddce5cea +size 16513 diff --git a/Meissonic/output/checkpoint-4000/random_states_6.pkl b/Meissonic/output/checkpoint-4000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..af33486b41835e3cb35d40216d8928e2ef875b6f --- /dev/null +++ b/Meissonic/output/checkpoint-4000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f2d194d564a1a0dd567572335bd82bac97c745afc4e2971351d3d3a8536bfd9 +size 16513 diff --git a/Meissonic/output/checkpoint-4000/random_states_7.pkl b/Meissonic/output/checkpoint-4000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..917102c146aa67dc6bfc2b13cb04dd08e07b270f --- /dev/null +++ b/Meissonic/output/checkpoint-4000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b7fcb309b0825199111ba84f60f4c0ad2b1d3f1b4f2a4f0ad9e43098701e6e +size 16513 diff --git a/Meissonic/output/checkpoint-4000/transformer/config.json 
b/Meissonic/output/checkpoint-4000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-4000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-4500/random_states_0.pkl b/Meissonic/output/checkpoint-4500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ae0b86c47769bbe34ccad5d09ee78ef2fe057b63 --- /dev/null +++ b/Meissonic/output/checkpoint-4500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9341474ac66cf3a3fb5b6d6caec550d6079205a838f235cacea748b59e96a3cc +size 16513 diff --git a/Meissonic/output/checkpoint-4500/random_states_1.pkl b/Meissonic/output/checkpoint-4500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f3efae0c6425840c0b33469cf196174f806a0698 --- /dev/null +++ b/Meissonic/output/checkpoint-4500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd9ba686b12115206afb8719923082402cd883d04549737d810e0d1d58c3a198 +size 16513 diff --git a/Meissonic/output/checkpoint-4500/random_states_2.pkl b/Meissonic/output/checkpoint-4500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0513372a7b0af5b61724d566b1cd60760df67d8d --- /dev/null +++ b/Meissonic/output/checkpoint-4500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d42e9febebc934ec9368c37715883c2d67378d59ed026d99e87ada76b119371 +size 16513 diff --git a/Meissonic/output/checkpoint-4500/random_states_3.pkl b/Meissonic/output/checkpoint-4500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3812678b76fe087117970665fc920d24eef676b4 --- /dev/null +++ b/Meissonic/output/checkpoint-4500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dabb629f7769f1b4ea3fe8db82c6e948e0ba701e67fe42d89656ea9617e851f +size 16513 diff --git a/Meissonic/output/checkpoint-4500/random_states_4.pkl b/Meissonic/output/checkpoint-4500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bc9af5dddfa6ebaae3440376dbd86d1b06f89e2c --- /dev/null +++ b/Meissonic/output/checkpoint-4500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b54ed645c4f1659a6605967a1bfa541d47b4de501b0e3115a9aa2303bf483d8 +size 16513 diff --git a/Meissonic/output/checkpoint-4500/random_states_5.pkl b/Meissonic/output/checkpoint-4500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ef4da6b86976535a6233e5b996396af27b246fd4 --- /dev/null +++ b/Meissonic/output/checkpoint-4500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5645f1b57b11ebbc7efbd0d93964900498c1c6a2992015e0541b048f17a4080d +size 16513 diff --git a/Meissonic/output/checkpoint-4500/random_states_6.pkl 
b/Meissonic/output/checkpoint-4500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5b459b90c8900ed3d211769311f00e4b025ca258 --- /dev/null +++ b/Meissonic/output/checkpoint-4500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2bce294e0d9662c404de739601baf1bca1defeaf9e3ee1c0e80a03064df7541 +size 16513 diff --git a/Meissonic/output/checkpoint-4500/random_states_7.pkl b/Meissonic/output/checkpoint-4500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4d3e9f6871af8c6768ca8fbbe79aa89d9f3e5e60 --- /dev/null +++ b/Meissonic/output/checkpoint-4500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c00ddd43d16e45a82113957a5832cec913b9b66466d9963362181945e693a57f +size 16513 diff --git a/Meissonic/output/checkpoint-4500/transformer/config.json b/Meissonic/output/checkpoint-4500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-4500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-50/random_states_0.pkl b/Meissonic/output/checkpoint-50/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ec2235b6135af353e24b5fa6e7410d9efe282439 --- /dev/null +++ b/Meissonic/output/checkpoint-50/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426ef2db72fda226aeb9700953a9b37f7f13402b7384a085d6178d776b7ee46f +size 16513 diff --git a/Meissonic/output/checkpoint-50/random_states_1.pkl b/Meissonic/output/checkpoint-50/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ead643a19d8a5a56e559ee92ac734fad4a42e219 --- /dev/null +++ b/Meissonic/output/checkpoint-50/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81591f8d336a22fed30b2aa4b699f4fa637e1027782b7c4ed326fcd0c105c1a1 +size 16513 diff --git a/Meissonic/output/checkpoint-50/random_states_2.pkl b/Meissonic/output/checkpoint-50/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9a78bd90d670d417e3a0be004efe404e6884ab69 --- /dev/null +++ b/Meissonic/output/checkpoint-50/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c51baafd0a01248a270cffd537c6ed29c735fd5948d74b44261aa37de268a91 +size 16513 diff --git a/Meissonic/output/checkpoint-50/random_states_3.pkl b/Meissonic/output/checkpoint-50/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..66348ba90ddcc78586422e71f662247cecada050 --- /dev/null +++ b/Meissonic/output/checkpoint-50/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d145d12ae63f2da07bd353bbcdace98ec702d7b651d81687bc0288f601cb77 +size 16513 diff --git a/Meissonic/output/checkpoint-50/random_states_4.pkl 
b/Meissonic/output/checkpoint-50/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..74fb5d74880f6848d1244cbb307012ff6936a40d --- /dev/null +++ b/Meissonic/output/checkpoint-50/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7875006cff1ce1fad111bb50e486f0bc79635750ad4fb15dc151f5d685298ed3 +size 16513 diff --git a/Meissonic/output/checkpoint-50/random_states_5.pkl b/Meissonic/output/checkpoint-50/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..76dc086b6cf3a8b0eb2157a9cc7709dc1fb636f2 --- /dev/null +++ b/Meissonic/output/checkpoint-50/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:801c9f40381dbe106095764959346f0ae51da32230da0ccb276b328d4274d0ba +size 16513 diff --git a/Meissonic/output/checkpoint-50/random_states_6.pkl b/Meissonic/output/checkpoint-50/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a5cac471cb40e790805d96043d68e2416da920ef --- /dev/null +++ b/Meissonic/output/checkpoint-50/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:684c654f45468513c6a1be5c5ffcb24b5c50adad6ea73ea1904a931305f6d0c9 +size 16513 diff --git a/Meissonic/output/checkpoint-50/random_states_7.pkl b/Meissonic/output/checkpoint-50/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c94b8b2379774ecacbbef31221c979aeb70e521c --- /dev/null +++ b/Meissonic/output/checkpoint-50/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51bc16749425e127b9e8b16dcc08612e2e1504ce84b6ddcc2291bcdea87aaa9d +size 16513 diff --git a/Meissonic/output/checkpoint-50/transformer/config.json b/Meissonic/output/checkpoint-50/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-50/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-500/random_states_0.pkl b/Meissonic/output/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ff6e15f3c32949277e7b6099d98882de013bd4b9 --- /dev/null +++ b/Meissonic/output/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d25cd6d925db223c3a147d70de39660e981aa09f3dead0a564ef484d6cc0c7 +size 16513 diff --git a/Meissonic/output/checkpoint-500/random_states_1.pkl b/Meissonic/output/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9e59c2aef41eb388b3f7347235e8997b9b375c97 --- /dev/null +++ b/Meissonic/output/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:421cc690730961c06150d7d1c8d003187734396b359b84d7949a8a2fd5e0ba19 +size 16513 diff --git a/Meissonic/output/checkpoint-500/random_states_2.pkl 
b/Meissonic/output/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4e015812a8cbd72fc7c8a4ca2df0eab356949c9f --- /dev/null +++ b/Meissonic/output/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4982db334e93509c95cbcd93ee350e495cbb9eeb3a98f87e5ea059e2fb005cc4 +size 16513 diff --git a/Meissonic/output/checkpoint-500/random_states_3.pkl b/Meissonic/output/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..063e4bab9f9a2c13f9b4828c678d51291fb7135b --- /dev/null +++ b/Meissonic/output/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffb82c5a77f4cd1494e2728da7aa1ad03c92e3df2719863be2cd000373d5fa60 +size 16513 diff --git a/Meissonic/output/checkpoint-500/random_states_4.pkl b/Meissonic/output/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..37ecec552d4998b40c7ff09e077af81c1ebe2827 --- /dev/null +++ b/Meissonic/output/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b79e0b9db77845853f49a8f589b86997e9f8a6d8d468f95c6179a28978f7389 +size 16513 diff --git a/Meissonic/output/checkpoint-500/random_states_5.pkl b/Meissonic/output/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fe5e4ee6e899ab6e0f61bbbcc6f3cf2fd1863635 --- /dev/null +++ b/Meissonic/output/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:445d911e7768b6402dec921311d58fff9550e2c04c48483332143ba9419ec334 +size 16513 diff --git a/Meissonic/output/checkpoint-500/random_states_6.pkl b/Meissonic/output/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..12c663db4175b3ea95149546c464f99f38868af3 --- /dev/null +++ b/Meissonic/output/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f70f428fb0509e7177f656741dbed82b31451963f0e7ce9b405f8b15b1b5df85 +size 16513 diff --git a/Meissonic/output/checkpoint-500/random_states_7.pkl b/Meissonic/output/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..92fab2b46d708b30b7c1804f47b4795164deb194 --- /dev/null +++ b/Meissonic/output/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd80c646e82f69a654245848b3be361afcb8ab8fb4793cc03d7d017b7024cc90 +size 16513 diff --git a/Meissonic/output/checkpoint-500/transformer/config.json b/Meissonic/output/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-5000/random_states_0.pkl 
b/Meissonic/output/checkpoint-5000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..44d59f7f41d096f05e9bf029ba024db12c612cb7 --- /dev/null +++ b/Meissonic/output/checkpoint-5000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d71fa5e30d6b0134b6ee90cf82ffb73e81c7bde950675d3f266d817c1a0cb08 +size 16513 diff --git a/Meissonic/output/checkpoint-5000/random_states_1.pkl b/Meissonic/output/checkpoint-5000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1712247950e665ac5d788d770232ee605df3d12d --- /dev/null +++ b/Meissonic/output/checkpoint-5000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0886e37e3dc76cf653562895ca4f85975e49501a70bcede0e3720f8770de8c7c +size 16513 diff --git a/Meissonic/output/checkpoint-5000/random_states_2.pkl b/Meissonic/output/checkpoint-5000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..64219162d4291576e9418cedc896581b9cd585f4 --- /dev/null +++ b/Meissonic/output/checkpoint-5000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98c5b83fbe32a7dc28531617ff621e2888e9e7d074dd345a7ee48c9b853687f8 +size 16513 diff --git a/Meissonic/output/checkpoint-5000/random_states_3.pkl b/Meissonic/output/checkpoint-5000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d668f10d4034b6f56d9f73727372e35f7121896c --- /dev/null +++ b/Meissonic/output/checkpoint-5000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35ddddfcebb4e9e4e3ac61e90c0996479fd9db6eea2fe5178cd1978014f1b902 +size 16513 diff --git a/Meissonic/output/checkpoint-5000/random_states_4.pkl b/Meissonic/output/checkpoint-5000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f1b77694b18975a4791338282a627daf3a98aef3 --- /dev/null +++ b/Meissonic/output/checkpoint-5000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:468d94bb95ae58e886aa64d0ca6a186c503814fc088337d73f35c721750508b4 +size 16513 diff --git a/Meissonic/output/checkpoint-5000/random_states_5.pkl b/Meissonic/output/checkpoint-5000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..75a7285d890c112611df2ab5a92c162e3491e62c --- /dev/null +++ b/Meissonic/output/checkpoint-5000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90de2006a26efbabefc529d825d84863bb9a6436b851ea8a31fd4052fa7f75c3 +size 16513 diff --git a/Meissonic/output/checkpoint-5000/random_states_6.pkl b/Meissonic/output/checkpoint-5000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e2900bd78bbe9e0defb17ab001b0b5fcf6960b38 --- /dev/null +++ b/Meissonic/output/checkpoint-5000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adbb3983282448e5c6ec8221d209ac8ad3779822549a3009f2e0701764235316 +size 16513 diff --git a/Meissonic/output/checkpoint-5000/random_states_7.pkl b/Meissonic/output/checkpoint-5000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c2b842c6cf03ae9a41c48fe25fb79cdcfdb456bb --- /dev/null +++ b/Meissonic/output/checkpoint-5000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5a5cb8b6e8ef24f5883e357e71552a4c615e3a9854bb50d3d58d0840853bf30 +size 
16513 diff --git a/Meissonic/output/checkpoint-5000/transformer/config.json b/Meissonic/output/checkpoint-5000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-5000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-5500/random_states_0.pkl b/Meissonic/output/checkpoint-5500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d7b74e63a9a67ef74f2544ef931e8d84f846cc4c --- /dev/null +++ b/Meissonic/output/checkpoint-5500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d45ecc84f6ca409f1850de5ab18bdf450dd55ee0536d9526e5d1bdcb37280f4 +size 16513 diff --git a/Meissonic/output/checkpoint-5500/random_states_1.pkl b/Meissonic/output/checkpoint-5500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..efdda30eea825e81291430679445e7a41c176efb --- /dev/null +++ b/Meissonic/output/checkpoint-5500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c3aac735990729352d60ab456a5d2a981831d9c73018aa51f6c47137ab4e1e7 +size 16513 diff --git a/Meissonic/output/checkpoint-5500/random_states_2.pkl b/Meissonic/output/checkpoint-5500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..626c9ecc4cbfa4edcaa744becb7100dd62da70a0 --- /dev/null +++ b/Meissonic/output/checkpoint-5500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1e177f7c30c4b1172ad7dc6b3a642158367760a730331051f7bf2d59ad8d3c3 +size 16513 diff --git a/Meissonic/output/checkpoint-5500/random_states_3.pkl b/Meissonic/output/checkpoint-5500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fb9f4fae320e78306b1c57c75019ec81708b0283 --- /dev/null +++ b/Meissonic/output/checkpoint-5500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e9347f0ce8bdc4a7997e193472c8861124974f4e7ceef501a847084ba95412d +size 16513 diff --git a/Meissonic/output/checkpoint-5500/random_states_4.pkl b/Meissonic/output/checkpoint-5500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f79ffaf3500255f7b3dea915a222b013077484d0 --- /dev/null +++ b/Meissonic/output/checkpoint-5500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f298b75adc0a874ed463518599823a705f103758460b68e100c3a4cf72372a83 +size 16513 diff --git a/Meissonic/output/checkpoint-5500/random_states_5.pkl b/Meissonic/output/checkpoint-5500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..98c5802309f571ba86e4ce4d0b119a77810be6c9 --- /dev/null +++ b/Meissonic/output/checkpoint-5500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3cc7e7678fa346563be32cdeaa24fca245d0e061044b60c50d414aa63e8d2c6 +size 
16513 diff --git a/Meissonic/output/checkpoint-5500/random_states_6.pkl b/Meissonic/output/checkpoint-5500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..50f184b1eb9557def6c2037608c65bda421555eb --- /dev/null +++ b/Meissonic/output/checkpoint-5500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955813cf12bd05f3e5642916c2087218beea492ba9a7a16ce8fe916b0a9ce2cc +size 16513 diff --git a/Meissonic/output/checkpoint-5500/random_states_7.pkl b/Meissonic/output/checkpoint-5500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..205513ef3d26a520969d9341941bfe7c336215e9 --- /dev/null +++ b/Meissonic/output/checkpoint-5500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb336ea4bcaa4453cc0bf1f91715847b3fb1662104de1703bab87164c55750f9 +size 16513 diff --git a/Meissonic/output/checkpoint-5500/transformer/config.json b/Meissonic/output/checkpoint-5500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9a57e4b0f5d900c6516e9993bb1215925606473e --- /dev/null +++ b/Meissonic/output/checkpoint-5500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-60/random_states_0.pkl b/Meissonic/output/checkpoint-60/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ee918630ef9874d2f8df2ece6996b40145ce7615 --- /dev/null +++ b/Meissonic/output/checkpoint-60/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:636525cc6d2e7f5b5dc5bdfa433b6bcc9ea67b008b94506a28383f4b35c1c77e +size 16513 diff --git a/Meissonic/output/checkpoint-60/random_states_1.pkl b/Meissonic/output/checkpoint-60/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..38619698da70a93b47cfa8e954985f19e13d890e --- /dev/null +++ b/Meissonic/output/checkpoint-60/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82c9114e3a3bc89e91f7bf5d3b55001cbe4d1f86d3b195d746f72d474b55b5cf +size 16513 diff --git a/Meissonic/output/checkpoint-60/random_states_2.pkl b/Meissonic/output/checkpoint-60/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..af448e705ab537f357c223aafe44585aa0a57ab1 --- /dev/null +++ b/Meissonic/output/checkpoint-60/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b294bb097fc8d5f892bf1b9a032dc2df96503160c9056f875adcae8a45b49207 +size 16513 diff --git a/Meissonic/output/checkpoint-60/random_states_3.pkl b/Meissonic/output/checkpoint-60/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0119df74b5848bc4243f1506fec36d6f49ed0c51 --- /dev/null +++ b/Meissonic/output/checkpoint-60/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db9725abc46d728618621ecf55611f59512d2bc7c07e06b0a2463c0dc441581f +size 16513 diff --git 
a/Meissonic/output/checkpoint-60/random_states_4.pkl b/Meissonic/output/checkpoint-60/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..25446e33b01826024b50a41374b3e88597514f49 --- /dev/null +++ b/Meissonic/output/checkpoint-60/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f8fd3a780a106303416dafb02764ac25c964fbe3d502454ad6ff2bcfad7e206 +size 16513 diff --git a/Meissonic/output/checkpoint-60/random_states_5.pkl b/Meissonic/output/checkpoint-60/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..81a69e03d00ffb742aca467d3214d9c3ee37b089 --- /dev/null +++ b/Meissonic/output/checkpoint-60/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f29d3ba01d97d1e84eb8abe2f7f9b72ce1340661400bb459c653bb6a9f034b2 +size 16513 diff --git a/Meissonic/output/checkpoint-60/random_states_6.pkl b/Meissonic/output/checkpoint-60/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e7a8824b1bc814df6a3f2e03a27b0e862c5c7624 --- /dev/null +++ b/Meissonic/output/checkpoint-60/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:796de367e6f786cd5973d8b38d870903cb0d823a7f485350c51b62959d3ba5d0 +size 16513 diff --git a/Meissonic/output/checkpoint-60/random_states_7.pkl b/Meissonic/output/checkpoint-60/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8ebe15e981a41d484427b605e28bb0af220d1e8e --- /dev/null +++ b/Meissonic/output/checkpoint-60/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:375c14000341ab0561f667e53e118b6770697e22c9fb42326ddf1a29fbb488b6 +size 16513 diff --git a/Meissonic/output/checkpoint-60/transformer/config.json b/Meissonic/output/checkpoint-60/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-60/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-70/random_states_0.pkl b/Meissonic/output/checkpoint-70/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4f9d5e0f7f58a77906f2dd457b7ffd64bd424c14 --- /dev/null +++ b/Meissonic/output/checkpoint-70/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef166eb5a207988936eb32f5faa238775cc608b0ab0dd6d95010999880766f56 +size 16513 diff --git a/Meissonic/output/checkpoint-70/random_states_1.pkl b/Meissonic/output/checkpoint-70/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4262abcd9a94858db2931b8412a462ada78197e2 --- /dev/null +++ b/Meissonic/output/checkpoint-70/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:560cff6b5c6e94633053d0a8d661657d30bcc361d77162c566905d73201eda67 +size 16513 diff --git 
a/Meissonic/output/checkpoint-70/random_states_2.pkl b/Meissonic/output/checkpoint-70/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..849c68ba0da795f8dd3bda19cf88e9b97f8998e3 --- /dev/null +++ b/Meissonic/output/checkpoint-70/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:113b747f85801c3e4e640cc9777c4b7737b4d5b100b8e326f170f67e31efb37f +size 16513 diff --git a/Meissonic/output/checkpoint-70/random_states_3.pkl b/Meissonic/output/checkpoint-70/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e2312b06349df8e9e12eb94ed24686db1983a6de --- /dev/null +++ b/Meissonic/output/checkpoint-70/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dae3397c5806eeac2698c3a2c35f7176158f1714e20a60ab401b6d085efffe2 +size 16513 diff --git a/Meissonic/output/checkpoint-70/random_states_4.pkl b/Meissonic/output/checkpoint-70/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..648d7359a542a1dc063f2254654188e494d6839d --- /dev/null +++ b/Meissonic/output/checkpoint-70/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147db2cb8a1c2c8ffd80a471e3327f136af1d134f4e35bcfc508bf9bee66b8b5 +size 16513 diff --git a/Meissonic/output/checkpoint-70/random_states_5.pkl b/Meissonic/output/checkpoint-70/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fd80ca0328e20a47948c832fc4732b1870d4216a --- /dev/null +++ b/Meissonic/output/checkpoint-70/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc306fbef284c9cf322a8bbd89066823e7688e899ba12042e4ccb7c1805d6b77 +size 16513 diff --git a/Meissonic/output/checkpoint-70/random_states_6.pkl b/Meissonic/output/checkpoint-70/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a837e99682c57d7d12acb6ee13935a8fe30f320d --- /dev/null +++ b/Meissonic/output/checkpoint-70/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8f1e47854266846bdbd09fc3ddc3fdfc6cebeedef729be91825e956413c7de8 +size 16513 diff --git a/Meissonic/output/checkpoint-70/random_states_7.pkl b/Meissonic/output/checkpoint-70/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e4c05770cc2b5d77ea0948888094865c36637748 --- /dev/null +++ b/Meissonic/output/checkpoint-70/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15c964350ff2e176d08069e7e45427513a232babbe0a72404c4ad2cf464a594 +size 16513 diff --git a/Meissonic/output/checkpoint-70/transformer/config.json b/Meissonic/output/checkpoint-70/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-70/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git 
a/Meissonic/output/checkpoint-80/random_states_0.pkl b/Meissonic/output/checkpoint-80/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d27096a8bf1f6e29cd7d726f02870da3697db727 --- /dev/null +++ b/Meissonic/output/checkpoint-80/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:609745afaac84b768b58a83c7576c3da658ceff135ab85624bac8dbd7b8fe73a +size 16513 diff --git a/Meissonic/output/checkpoint-80/random_states_1.pkl b/Meissonic/output/checkpoint-80/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..df0ff0ae5f15d08e0c8f1f01425bd0e9b7cb9e92 --- /dev/null +++ b/Meissonic/output/checkpoint-80/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:498c78decd43dab3cf4f1bae5c5ce5a7b8c7f5c74f57bdd30a916878facf0072 +size 16513 diff --git a/Meissonic/output/checkpoint-80/random_states_2.pkl b/Meissonic/output/checkpoint-80/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..eede651baa0abc4882eb7070a76d90733424344d --- /dev/null +++ b/Meissonic/output/checkpoint-80/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e136a8904576667b8e1746f4d1727fabbe898167edb9a225b559fcea0cdcac42 +size 16513 diff --git a/Meissonic/output/checkpoint-80/random_states_3.pkl b/Meissonic/output/checkpoint-80/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..07f3129c99bc3b1917adc7604c4c6ec3f5e06db1 --- /dev/null +++ b/Meissonic/output/checkpoint-80/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea594c2251224e5fcab581928583f8814adb4fc34f8a5343c84fa2caaf370261 +size 16513 diff --git a/Meissonic/output/checkpoint-80/random_states_4.pkl b/Meissonic/output/checkpoint-80/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..235a85bbe949509efccce9cc64b7d66eeddac38f --- /dev/null +++ b/Meissonic/output/checkpoint-80/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c469ea141aec322f274ca664a6754a580f0399245889748dcba2c6b0bc76c589 +size 16513 diff --git a/Meissonic/output/checkpoint-80/random_states_5.pkl b/Meissonic/output/checkpoint-80/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f09ee65c520f19deaee55bfc941b9475271bda54 --- /dev/null +++ b/Meissonic/output/checkpoint-80/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d22a46dfc64de5a74f8afe1b30efa6d811c57ce37f38536067684f512436b1c8 +size 16513 diff --git a/Meissonic/output/checkpoint-80/random_states_6.pkl b/Meissonic/output/checkpoint-80/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7e07a7769886e464871d13db6ce3da10bf61ec19 --- /dev/null +++ b/Meissonic/output/checkpoint-80/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ce4a5fbdb88ee497ca92117a126bf46f02d0bca8a60ec34c3cf3da15878101 +size 16513 diff --git a/Meissonic/output/checkpoint-80/random_states_7.pkl b/Meissonic/output/checkpoint-80/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ffd4aeefaf77db63c69ce75388daa7d90dbe1291 --- /dev/null +++ b/Meissonic/output/checkpoint-80/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7741d0dfa08088e981b76b65ebf9b5b0c1dfc66b447f272c1968ec48921b3322 
+size 16513 diff --git a/Meissonic/output/checkpoint-80/transformer/config.json b/Meissonic/output/checkpoint-80/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-80/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output/checkpoint-90/random_states_0.pkl b/Meissonic/output/checkpoint-90/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ce9c2af6037bf0a4a06ce38704f7bf41971238dc --- /dev/null +++ b/Meissonic/output/checkpoint-90/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:650a582f007453dd55abda4450e3e64ff9581109d93fdc1d4ab4a8f70833c5e1 +size 16513 diff --git a/Meissonic/output/checkpoint-90/random_states_1.pkl b/Meissonic/output/checkpoint-90/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..88ab2623740419c3916ad968b73c676f8b4fa77a --- /dev/null +++ b/Meissonic/output/checkpoint-90/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05a5a254c8129b20900b2f5a8d6f152d2be175517e7c0e1d739478c539a42e76 +size 16513 diff --git a/Meissonic/output/checkpoint-90/random_states_2.pkl b/Meissonic/output/checkpoint-90/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c5f34a9b008093469cf6f871c00dee32ed3c1a05 --- /dev/null +++ b/Meissonic/output/checkpoint-90/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2a5a66561e3f76835879d0f7ecae023085e0e0188c8ce31256eb380cdca2608 +size 16513 diff --git a/Meissonic/output/checkpoint-90/random_states_3.pkl b/Meissonic/output/checkpoint-90/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..aa0f88c8439f25fe0282a70861b7d708576f338a --- /dev/null +++ b/Meissonic/output/checkpoint-90/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78a5825b4cec705c1a986d2e256a82e18cede0715a0f0d353b14c41df8869fb4 +size 16513 diff --git a/Meissonic/output/checkpoint-90/random_states_4.pkl b/Meissonic/output/checkpoint-90/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..226101cd8ae3aea44e2fcdef2e743b8f01b3d6d1 --- /dev/null +++ b/Meissonic/output/checkpoint-90/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cbd8a6f4ae7e81183bf18f31ecc7b8358b395023d254a32d0ea6c9531fb0284 +size 16513 diff --git a/Meissonic/output/checkpoint-90/random_states_5.pkl b/Meissonic/output/checkpoint-90/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6b222b328ed374a256f41741df6dcfc19c570ad0 --- /dev/null +++ b/Meissonic/output/checkpoint-90/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a93136109755b177ab98dc481a2935207da5f0ffd10695618dc106e650b00f +size 16513 diff --git 
a/Meissonic/output/checkpoint-90/random_states_6.pkl b/Meissonic/output/checkpoint-90/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a3a28634b1d83db8a1dc1d5c7aedec80ea486c61 --- /dev/null +++ b/Meissonic/output/checkpoint-90/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8880cd4e3b5598f2ab83c0622b916704ebba9c9d5e4e3306a01ada465ba52c +size 16513 diff --git a/Meissonic/output/checkpoint-90/random_states_7.pkl b/Meissonic/output/checkpoint-90/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2c01fbec31aa3f027df0188def01ed405562215b --- /dev/null +++ b/Meissonic/output/checkpoint-90/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e9a81ab13dbae1754bec3c4a81db7f2fe93a198c9eb6ba4059fa80c135b3848 +size 16513 diff --git a/Meissonic/output/checkpoint-90/transformer/config.json b/Meissonic/output/checkpoint-90/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6db18ad87e283b380f5ebb2a1b2958bf64fd74f2 --- /dev/null +++ b/Meissonic/output/checkpoint-90/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 30, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 3, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 53, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/1499_video_0_CFG-9.png b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/1499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..4c7187945690a70279354741cc60575fe8e5d2ab --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/1499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56d7d215080b273e9155fabd655e8d067a165f6e2554fd135b5233869915f077 +size 452138 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/1499_video_1_CFG-9.png b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/1499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..86f2afb4ae6521a3ef13f10a289e3a29c9d7260f --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/1499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:736c017ca88662cd1d11e43ada8129c0fb46f23101f73c275e6bfae1f95d7b55 +size 452537 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/499_video_0_CFG-9.png b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..8396f8c64b07e2f15cad630a54e1f9bac46f8758 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f69c990d95a223b9d06117c623b8a353a9415a910fddeaf43bddb5fd465c2b5 +size 330201 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/499_video_1_CFG-9.png b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/499_video_1_CFG-9.png new file mode 100644 index 
0000000000000000000000000000000000000000..6522c3ba8b5dbd7111ac918ba956841f177a9b48 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fc1dbdaeeaef4847234f332b9f24bad9d5540cd20f35cd09af9b6b8bd26e96a +size 323804 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/999_video_0_CFG-9.png b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..878e99d547a10280cfc35b864fda1c8b7542ffec --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654e84862d8c0a13f1b5c5321faf51f7e42bdea5337ed399b9d45addce85278a +size 449974 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/999_video_1_CFG-9.png b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..38f4e347989e21a17df32d0b41ad1253f05db150 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd153009051a7d6050189d64dffa50ee778bbb8be7b2d706c53a9bfe104bc445 +size 408663 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_0.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6fb2c016572cab7835e2375646825bf6f121b03f --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:616927f975d1d64f1afaab59431d66b95d17bbbacc8dba5c0cd71be50b40390c +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_1.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ff32b5daa7a3a8b544c9a70e46997f68321e9740 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8189a43b27d5972e0f611f84b6d2b557260e09e171f408ea2fa6850c7325363e +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_2.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..921f14df8b5f5d785ac74378ca53ce3acabd5750 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42cf23c384441856ddd91f54493fe800f68c2ab013f06004543bfd1c096d357 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_3.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8021a9afd6868885c6a312f6d334524d41637274 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_3.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:22718598fce41e06f2107475a0ac0b9ab72caa49bb11e600fafe08d85fc24bf6 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_4.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..72a9fb40333978e911c43b47ad5e42d372279cff --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e81abb10fddcc00c21052522c6fc38395a88df89f974c5946de1e037c6d6f76 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_5.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..32813385e59617cc085b81cdf3c8d4225225eb52 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0daf14a49d772cd723762329c142cc3c157b342647dc10cb54124c5a146d809 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_6.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7c5e7ebbdc478bfe2b32edd7f19c2f5b2360e164 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe52fbab3d4607faae4066aba2a72cc47c7b49d7a0509345c6d125efe773ea2c +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_7.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c718d1017f5e13296961011eab55aede40df9810 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b36f54cbce2eb30b8647600c3b4110cbd8aac3a8c8d402dbfa23f774dbd19445 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/transformer/config.json b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..76d50854db47ff4100e94bf1647ed3577588dc23 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.36.0", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 16, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 5, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 16, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_0.pkl 
b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4462c84737a0e02ebbff949feea76d1e3122f98d --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b6852c3c94a25616636077fca0fd4cf0f6cbd882d4d10990158b5ddcf20eb5f +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_1.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b7784e887700f06d1334c86d3fd30ccd11849dfd --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d023416d41128ba64b8fa650e6c1822cbb01309b5636c003a6dc6963ec7d7095 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_2.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a1343b3ee409b78e246174f32ec065a3fda411a2 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be790108ff9c5a106dceea1a82d5d45f0b302912d5f344cb76f03001042affd2 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_3.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..872cd4163f7a2cba276b0b3230f2cef8d59d3370 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74447bd33d4b847d0ebda14fd3b7614de4f0e83f5c916126bfab01a169aff695 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_4.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..25bd012c98e11565d0ec3a9555e5b211708469bb --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f690486132bfea8d4de638acb5cf533500d5f98c62ad1881deb301c14686081a +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_5.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..292e9264aee4f53f03d374abf2fc78f21faa29bd --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:394230a792b381d9ea74736292de6fa5190ada5b69ef784600b1364628ff6424 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_6.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_6.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..ba8bcb5ac9ff75caac0d6038c97cf6e0f33e368d --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e208cf30757f65adb3787f73ba034046301f0b2a537809bfd47a81b78e26db3b +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_7.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..51f0426a2c3769684bbec1c7456e0b2623d46ff8 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45f72f3babf98d9908b92872462eca38a478407d57bcfdd609b3520a9649363e +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/transformer/config.json b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..76d50854db47ff4100e94bf1647ed3577588dc23 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.36.0", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 16, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 5, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 16, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55c2b20e8e366a0125592e44ed045dee83256af4 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d563ceca790449229f523d52949b9dca8302cdd1fc4e6202cf9e161f096c535 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_1.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a81ed8d10bf14efe8c6c294874d3a6f4996c335d --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098d78e7ebb46f8b679706e7fc1653a89006c9ad885cba1f9b19a938fe60e3f0 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_2.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ef90bdef724e045ceb72089c9f41109f43a34774 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:119b2b4df66c0a551569eac4d36816d58244f247b04a7d605d20a8bae7a61658 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_3.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..afbe712f65ccfa2eaa5f336b6babdbab0faaa567 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e35f3c89099f9272f21cbe10aeecd7980ce41938f9b3732b16c7d2d4032d1568 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_4.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d0a5730cb1b62d7e992ec6dae55af75de0fdf7db --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4378bac8305a7dffa84dfc200e8c4b4db99b19079374282d1cf242a7f6eb4dbd +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_5.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a184059643b92bf70d068ca2c364e756a731ada5 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7831078f6931101d7fc70503f3f4533e3eea027fe090a2751e4122d3b28a03d +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_6.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5d569d5ad6ca769bcd78f735dabc0db7279322de --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a5d402dee74bd0fced7ffc78c4295556e32d9bc868986450a4da816e9549536 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_7.pkl b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..af0e253b80cfccbd6a47d10dcc9502ecf6ab8f4d --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a08ed33276b289e623a6667e9da9506c98aa688c20183e08c72afa1abb33a860 +size 16513 diff --git a/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/transformer/config.json b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..76d50854db47ff4100e94bf1647ed3577588dc23 --- /dev/null +++ b/Meissonic/output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.36.0", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, 
+ "freq_dim": 256, + "height": 16, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 5, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 16, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/1499_video_0_CFG-9.png b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/1499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..d62bca398a77b34ae3370abb06c9a589c619e8d1 Binary files /dev/null and b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/1499_video_0_CFG-9.png differ diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/1499_video_1_CFG-9.png b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/1499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..9d515824cd840ec4cc777ea421db55149f10f9e2 Binary files /dev/null and b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/1499_video_1_CFG-9.png differ diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/499_video_0_CFG-9.png b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..df56735949c1ee4dd76678a022b0c2fff4428dce Binary files /dev/null and b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/499_video_0_CFG-9.png differ diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/499_video_1_CFG-9.png b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..c949d7f93efb8581eb380eaecaaa4cb13b5e9a64 Binary files /dev/null and b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/499_video_1_CFG-9.png differ diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/999_video_0_CFG-9.png b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..ec5038cf126880965951c4cd6b3c7e782541a01d Binary files /dev/null and b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/999_video_0_CFG-9.png differ diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/999_video_1_CFG-9.png b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..696a127bb2b8baf83f042ca6a79e2c8847c12b4c Binary files /dev/null and b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/999_video_1_CFG-9.png differ diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_0.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b85290d022d06adbf4c0ff5b66f299cc22c3d311 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e3a679f226586e47ecc04699ac2befcabc53c0152f7f62932e4499e3e85c6fe +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_1.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9ca338ca74aa21106febd42774ce622825ad4369 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cfebfb659cbc2e477fe070cf3e06f2760f7556a6ece3e5c43c3a7ae17adcbb4 +size 16513 diff --git 
a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_2.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a16f711ee381ae978f48824020cf54826f6bcfc6 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44cfac57723166868b770ef6b3eeafe04c4b83aa1e3879abeb74130357c7b96 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_3.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9068a20abf0c58b19f5b254e873414915134c438 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ea0b25f6dda6e0403443c54901d2171d43277aaf943b7a1cdb229bc4be0200 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_4.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7d17716ca164ee1d185566831014c38ca74f65b2 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff9ba57f542479d6cc77064d442c52245a56682cce2fff8990de10958597e198 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_5.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4f3d4c4a86ff3b63f26bc0b35ed01c835e5ec585 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4bf7a4e612f3e4ee2311d2f5f0e3108976ecd349fa20fa82678216103663879 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_6.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9c33a7b88216b8e4564559b54e48e1850b7df813 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0561ce0cacbc888e6948239ae4c269ecd9e0312e90544751f57df894c4c4a03d +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_7.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..58191d9ca1d8ac94e7d8b1620aa5fe77a2149574 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a383512586e729909ab764c530d67805c1fa51e6b252fb24ec6700c7f91d674 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/transformer/config.json b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..819156da0e734b54fa9ac967e46b351738ae1574 --- /dev/null +++ 
b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 16, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 28, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_0.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bd1b6e0f47edc371b2c10245a9a9ccdb86009d61 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:890250c4ca861a168553e4bb7d22fbe071d5cd1ddeba4b538caf9c98bbad9669 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_1.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7aabca05fa70e0ffb454e792c5e05e88a3286016 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f3ef45d57261d6bf4fc88fe327a930a433b39d5cf750d92272c24d86d5f5dbc +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_2.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a8c7ea19b77b5155a2ef3d557d5336b44a7afd25 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30773e39afe05f34a21358ef30c70dd4073b605515cb7f96242f0b8df9c90432 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_3.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..67f78c9ab3085275681aff995eb747d140d8afbd --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15f6440b2d24b9b1f1ca37aba03332f5f4af141f76cb81488e5a7da23884cc0a +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_4.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..31dd28c8a64c2af555ee101030897bf9b79d20ea --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43d8edcb02870952cb5f13fa35ce6a52a54787ce7b9fce63cdfba310bcd492bb +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_5.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4e604f0a9f4bbde9420c963bbc08ab2dbf41b413 --- 
/dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00abbeac252831ccbb02ba49d6b1ca999355b0665f4b2bc3f1729386bb189fc +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_6.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..67226241b467d4470b4f26ca7f953813e340881d --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cd3c2a516f7049fa4ef80b64b0a1777fa8a8f3f8c330828941de0421f53c924 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_7.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0b1443d18ae02fc4cf8c1458405de8b8c7b7ff00 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59d742353eb4b4baa3bb988c6596d4e9fbcb1c74abbce67ccdbf3a6db57b87a +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/transformer/config.json b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..819156da0e734b54fa9ac967e46b351738ae1574 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 16, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 28, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_0.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..264e88d84f7393e8c3ae759cca3568f61004a074 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b533d48bc0dd2fb205f2cbe893e53e726edd71878a3c540a1b4cfcc5044fdf4b +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_1.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..95bc2b68eb1cdd5a13feaf142861959f7b56f159 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19a65a42ec1f2e764035bad02c00cec597dfc764af52454d0321a0bb5660b3da +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_2.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_2.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..3026b9392100221f4d179817d44722401e8c72d4 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223394b7c815ed9f97e9499defb13d27537d77b5fff867f8c91d043d2dd8c565 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_3.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0199bef5633cf0073505ac18cdd0df439269b71f --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e6e0fb6d5887bd052ad2c9eef81fc70dfa91bd186685b7b159877de0d6c6823 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_4.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b625ef8fd6a2c3272bc6cecfedd2649adb15f9a0 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7fdb1c239cf12ac6e9fec014c9f5f582221f13a1fc74a71e643ecc3990330a +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_5.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0268be3ab6a494edcd0771e848a129f786c44877 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8fe32621de72eb564e938b0e46b29315bdca5847a3db10b54965aad65474790 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_6.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..62959a765e8887cc250d6c48ce77415d23ccae06 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea9a36d510b976468fb32edfabebc1b1f1570fd69ce361e142c87d7a05ecd4d5 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_7.pkl b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b755ba74fb8ca60963547cf7e38ba15d26bb58a9 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc55de31b54d24bc3a17ba313f3a5da0ed5f3958bc82323e56f705410f9300a6 +size 16513 diff --git a/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/transformer/config.json b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..819156da0e734b54fa9ac967e46b351738ae1574 --- /dev/null +++ b/Meissonic/output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + 
"ffn_dim": 8960, + "freq_dim": 256, + "height": 16, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 28, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/499_video_0_CFG-9.png b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..d971e9ef5b8dbf483552c8b967b349f76ffb64ec --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7db4b55e80fa0efcd0b6ed76e089fb9643a0ee6b79c9634407f8f90b3ed37361 +size 295254 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/499_video_1_CFG-9.png b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..1540ce38ccb2e32616fdf7cbe2efcab7dc2bee2f --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:109d32738e98e72abfb81e1b9ea4401ae35bc11b1847a0d215dea8821c04161d +size 290028 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8f3c6f4b2049639363ae9f2492549a79a4c79efd --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b525ac54469b87bf42e564db65b732e47d7a6c2fedb73ce88fca951ba22c02 +size 16513 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_1.pkl b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..549e130b8ad9ff81ebc53e8886fcbe99034950f2 --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad84c1f50bf09f49c56a29cf16885d9df40d38fc70bba9f77631d3f50ef88dc +size 16513 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_2.pkl b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3c16847ce86ccdf01deaf825aa9244ce2da65d2d --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d2141a3525fa9d3bf5d9ba65895943f05edb17f79334b571709c2b706fe69f +size 16513 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_3.pkl b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d92f5fbaec603cd84ac629cd08a4a1462ae8502d --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fdfb704cfcaab593ac711e4aae77ea619022cc3a4fccebedb99c03e07acd4221 +size 16513 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_4.pkl b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7a9931cd580bd1ffedb74e04e8d36a966008aa3a --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0fb4e58432fcc655d343f1b157786233a6f061ba5b473fac1fc2bdf4d2e8731 +size 16513 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_5.pkl b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..518d8254715129087592113d1d3e0c701bc543d5 --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fde5e3a97ba236ad71911a01094819eb218f11cfa9f3076456f2112b4e6c97f +size 16513 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_6.pkl b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7aa0cd8064ab6963902f90e48eca626fda8f4d22 --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf70ceef26823d0c1cc938492725c76b8dded365305adf21ccc1cc3c93d7de31 +size 16513 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_7.pkl b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..13a36cc7344107c270cb6b23611fd2301bf7ab07 --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18c42bf52cb40351d95f96c0db17dd10c33fa94d04693c7e248f7cb5d0de3679 +size 16513 diff --git a/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/transformer/config.json b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c7f0a4bc511f641fe80190539a2b0e30bbcc87a1 --- /dev/null +++ b/Meissonic/output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 16, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 5, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 16, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..6c57f92a48a45124c8047e429dabfc630e682135 
--- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e846322d8d1fe1da0c06af5a8f74c3afcadf53c258aa36a35d4dc90e6722dbc3 +size 229025 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..aa69874ba98b31a5706dbdae389a2ff9f07d8bf4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c59a605f746fefa06f3c8c0981023f314a8343302ce0498fbb4aebb85b5ea48 +size 234597 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..eab416e77eed30c536341ec9692595b8f1b922ac --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb353e591b1b0dbac38635e86c24871936ac696f4ff7674771a6ef15a513e45e +size 229162 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..288b5357bfd6f44c47c059f1b852223a08d2eea7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/1999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7251adb287026b97ff807bb6fa787d23e29837fb1ab75855513e00dc2679e7d +size 232454 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..db3f8e2868d11f353f5051aeb20b5e5c095b4dd8 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:880fb5b7bb7d55a411020da08650dc25fe858c588e36c6173cc832251ec76713 +size 234622 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..257d8ee8ee776ff5b4131d710fcfc0c5ca82a250 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4254c55c5a44dae8222b2f94225010db931e57096a09925bac1e768f58993672 +size 236913 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..a66371578a8fe0146839e4faac46604f2d924376 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0af47ac2b0fd0a7b83b953ffea43750361aa0fa8df8302b1b8e87b311f86db99 +size 235174 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2999_video_1_CFG-9.png 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..14629585c7a65a8365edf4a781eb7e55bf217720 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/2999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38859ead3b87553090be10142facdc7a4f8780e6d9d0b17eb4abf0bc124b0746 +size 229801 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..eb64442cbc0de5576a61c8404e2bce41579d5f5e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b3f708ccf2664b9bd84c4142e94f63ed1f1d9fba426b1835eb30e04aad78ab3 +size 240388 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..768e69413dff2086b1265e4fd28b894fa044ed6a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96fc2c23d9374b5c001fbbcc7e3cf2abc977c4fd89bff8872af08c61ab92cc00 +size 236161 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..38098b236aed7aa104076c52235d007303a98341 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36bf77eea280b84a34ed04791d44d542d3b4425864c9020408af51961bb2ba2 +size 227665 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..933734f026856731f07931a02829589ce131bd38 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/3999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f60fcf85257e0427cb40bd8076e1243f9e42adfe61075674d62b0d5841bdc6a +size 231968 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..f4c5548f611f4b0aee0776407d41af5a021ad14d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6969510d28d905ce414921aa949d88990ca6b2d81ab742cda228c22accb0028 +size 209920 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..9783534a5783b293f8e0b68a1fe9172cb02ecc76 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:92b8064a4e25f8ad370281aa9f5364c7b09de3cd29ae7ba63ca51dad6b402323 +size 227242 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..5dadeb5a0b3e104d148a2027d8c36f8c835acb99 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be5afcc9b61ce7cc976552a10997db35e39d3777b8ba46832f7b730c1fab3104 +size 234944 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..e3686752e68ea4903b416f823d7715d05e0722b4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea9441b2526821550065be4282383a91f18e8f289e6f64c5286893c87f4e5e5 +size 235465 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..164bb453b5d4837f8abfc80013b2f5014b1c4526 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9566c529f1e8631fe7010ddbf645c8e48dc06aa7f872a40b8c9385a7bdc90d01 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..69367038ce3bd9d6fd56f68c53e7d5ff52383297 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77756ce712423b3bd272c73697dd7ae47c49aa34447b86fd32df01bc4f600eb0 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..72bbb82b7d0580b601482903546aff9aa0f024c5 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060700e247197f50cac6c85c06d1a55306f8eecea72d7c191a44debeeb75108a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..569fdb89762d54cc640254be31cd51edd859de59 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5e5cd6ef288840cf99a0434023718ca2b006251831a21fc160f428e10ce989 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_4.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..875875d234a200e412b5823dbc565f6179491ef9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d993e8309024f8f66639ce8e5b711a1cfb98401dec6022a4387ee02b43f4b7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b339b270ae7aaf8d0ffb3ad58ac3624ce0bec616 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85bfc5d4039b634d421d36d9dc03d001be120c4696ccf148f56ed20aa43a8923 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..51cf909816d7fc4e026f81c83b33deddf2d3e7f0 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74be3b141f25ef7e7008951fda5e6ca749dfbc8e87e0636b459cdcc2343eff3a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3e7a2ea9ee123a2229ebf9bd143a776cb502c17b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43245b399364e517d280e848e96120f9eb422b6134020a37e231e9be23125201 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ce0d4b559eada128ee9498965d19e0cf58b91ee9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_0.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:87d9b4f124588c7c5610eea9ce79185120816d458ce0d89c6d98d2c7bd37305f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ef3ac76c4afffa3ad633588cda54a2b6f21c897d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18865050fc60ad91c9a6e434d638e553cf9bcfbc2164fd0b76dd3d03322e56ad +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5a7baf657464e2d038f824893ae11e3ace3051ec --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95d3be3d1ee42e5d38967abfa42512b46188589f33013cd5a5c5e87b7e498045 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a675b15015d3f512779f1fe2d3963424dcdb9573 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22cc5f40e08949640f2f9743f400417e88a3921c3d642921e4d7d0e1bb59bea9 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..41e6afafc503829e9b36fd01bac1f957362d802c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:117cae4db85098d3476ab68b541539a727d1e5723a6b37fca3883e8231c62e0a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e8222168025e41f28bdbb36b515794037c7d56e4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:954e2567974690b4194f07fce601699608845578eb539b0a2ab07666dd1116ac +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..dce7fc118174d2a0f2cd7942c5fa52eb337d44a4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2185ff454fc47b92bfaa90e8fa7f392120e5cdc4e61e2ec36f0a22135ba6e73d +size 16513 diff --git 
a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b2250803db9a9655df5c288bca75a6723521878d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deff986e3c964acb9a1a6b500dcb98001a56028b8f3e5080f4223f40108a2c75 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..502a8538bdce73ec9a972c39425d72221198545f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f78a760c044c306d995aa67de8e9c659165b65fb01a446767c1b423707ecb4b9 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e2c63db09605b3a5aa00798b597344c59393234f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b9769e73bc31f3805f2a3e0e04421b320adf0412bfbc8d4a5de41fc234d33e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..283438fccb7dcf8628d9ae4eee7294c15f8927a4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e84a073f9f96aeaf68c6a3880dd6611a7e640d6ed50e8aeebdfe9b41ea261826 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..eef059109f06da19cc479a4cbc56e8fd5bd3a730 --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e278b89825dd4634998ab5f54d6763ce7915c2fe7a91ad53bebeff322b709e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9a997745438530b06cb5fdcbbd0e57f629790e3e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885d2e96331d93b60519171e045b30e4a0802ea79c54c82b74056e9bcca8f2af +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0c58d4064fe2c66305c7352679653086884a9f7c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e715d5583b6e5eadc5ac39372f64d3579843977b33798942657213e68c173db +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d8b4b2a57b72266a73ccff79cf02530b7667e112 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d3965d3cbebbcabad3e980654446dec7d75b6359e0717c357c8a2fcc95cc07 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c7fb992b8006551ff736ae2b01a800f0ff86a4b6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:494b93cb9f1dc372f43a49427a1b7d4b46b27e3ae0d01a4bb80f0ee3df5fafff +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_0.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a86041af61d91c209b19e3a8e312feedcc872270 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3030ea97b337a18bba7f203c01d67f07b561543644c0772887358d788e34974f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..af70b1bb1cff44dfcefd86eae8de6cf0e1f7f6e6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151c7aba50178b9bdaf3d5663c4715364d98d9685b6a00072064ce2b275259cc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7c8bd1e6328c3bb24052393705fd5fea04a061f1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703cd9b0b9c424250441c1436a5e6dded0e2cf186c9a052d92424436e4d87b94 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f8570f0174bc89f1b636d70e57800a99dd6af9d4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:899f4193794df4e24a2a1a61c8fc06f02192c6ce07b6ac4196026f9544e905b8 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d9a6c7c34ae1fdb5933f07cb2a702191be6fa5ae --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:483f44759a4e50f5ce7ac39c8678c8c93f628d1311ccff4e6512bf188662a075 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9c17cdd03de2f3b9e7808bae8b96909b34b1278b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6eafa2e7dc1627e370b4dec031fefbe457c3417f34f98a5662f1211f5a734ba +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_6.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..95f1d902eb0c6745d620bfa3dab906dde880dced --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6245dc47d5126fe0fa5261dc19d6dbd9b178d09387221da232c6f279e9f51f6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ca2be54bb8e1af595d3401375a633fd30d525001 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ed24a37d87eca08e9700d11c8407cbad00b1bc3ec8f9c0411d15a18986a6b67 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..aed20f02c3a58e79d9309e60a42459485ea17d62 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6faff616bbae42d9f802b58416c0a1c0cc5991fd21133de2f37dd8d4f787385d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8472df94ff3200f774bfa13d8a6eb7d4fd217668 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bf04eb34f3f4f6a05da0934555816e26f4b98808ae0703133a727404db7e479 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e7cc600690627928eee481d9dbf99993a8d97506 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0943a4874d0a8ad2d4a940b2e5e039f0e74c1efdd515d317d5ca7eef2864bdc +size 16513 diff 
--git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0e27414190ade1b1d806804bf811b9f339e75699 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b914ca6c7b859f93591e68c43fb0260308ed5978533ccf1a35d28b6f3a27b337 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b8eadb7a20a15a19e7ece7c49cb5928f66bca24e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c9ec8768c2efd801e4bc51c46ae83169cb26d1562ccef1c995bd746a731431 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..06dad9f35d19a507baa7f9b6e06f6dc097270f33 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4f41f62bbe293fa6d3a33bdee42a1e2edebeaa6190430d4478cd26ceac20b0 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..69ea244d3cc7984493d6745a56bb1d2b3784bc47 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ad2d14f157ee9811e396ca183bbfe442de36b0bf8f3294f0101626b3a9ccd51 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..773575c38b6a7a44fdeb9a3c0981e3c0f3f7e62a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59b74cc5055c1386a6cf5bf345695f5a79fb20f7a2e37f1a4e08f7770aa9fe6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 
12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ae6310e43be9ff986ee32c2d0cd65b88ba20d846 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5d4abe3783ac35ba6b3eea431c0c2f4ea9113ea1e58a34de7c3729b246da033 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e3f50248d493f17cddc507f1c7ad3329d9b6f39e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:958a10c0f41176548f6b53e64429ea0a9332b4628c28eea29cf86af8670efb96 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..50f9084c931b316e789e5483fca278aefc11b794 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:063607059e9f5f804d2695ffd28056bbb167f65239e7669faf0baf4984e2d119 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..be27619c6e84fc56dc57c6dd4d5f378df0881bcd --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe371823ec4bf9047d3e1c17a22d5b713bdd1c1ec93023692f36c1bda9517db7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..549f40fbc8630b4a8afcfbcf883bb447e4afa168 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db5939cd80157fd9e7bc423681ca9d8c5db02b1e226662947a3270ac5904ecc3 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3af4fdf8c47413a7ef53531ebbf75183e6d3f3b3 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:edc3830756ac23805a9673a00a10b4319c90aae953622f45c78b1121a1e5e54a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..86441127488da68f6bf3f9a6677392f1f64ffd63 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb2212dd9db7565c5def9306836a7c4bbc4b47b3e486f9c59694b36c61e4c74 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..83176eabef60c5a8824a78e40276195baa3c2967 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d6dd8f7da22e1026f400727b4dd4c2e79276c0d11c9eac20a979847b98cea2b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8ebd2a2c6e0329b0d71929b6198b645205aed6d8 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9522dd6d6f629044f2292424bc28d8f77e38d021d948b7b3f1fb9bfcb7b2bd72 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d08282022fc174f250999c463acd46b070b6fc96 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4aeaa326f5347c9612f364dae8b6ea5fa3d077703ceca4b88ea8ef22d8b0d44 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_2.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..8cb388285d11fd1fa54b392ea00ba6b875ab2fc2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9884f1f246383d938505e8f2793ebd1c6fa5a733162416c20e07d240cb105c4 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f81bd664fcb32598e4b23e6e474f9e0c7e688891 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f80db84d8658809eb4e0113a57b4a32d6674af3902b23835ff5734fb0ba1ab23 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..86301b3a66b90b4ddbd5df4eff91c3ba08bba0a9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47780bc7316848c7b6d95389ee9d5f3b59271bc8fdaa0143eb7f64cbc8f43acd +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6e2793d2f9777371cda392b5a657ebfe1e3c9dac --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f866adf4a0da01caf64e01751400e0631d924dc301abe2329b634872827470 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f73f8516be54c8984306764a09ccf50b3f49fa90 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5bcc45e6b8475762f2483530cc9b31e33ff7218c2e7d570065164efeab5d40 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..164c3a58583132f2b88f0a2811c78360cd8291bf --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea2ae81986ebfc524b63d14e0b1439a392cf0a24f36e4b2239bcc9ea10097c59 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8f3c6f4b2049639363ae9f2492549a79a4c79efd --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b525ac54469b87bf42e564db65b732e47d7a6c2fedb73ce88fca951ba22c02 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..549e130b8ad9ff81ebc53e8886fcbe99034950f2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad84c1f50bf09f49c56a29cf16885d9df40d38fc70bba9f77631d3f50ef88dc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3c16847ce86ccdf01deaf825aa9244ce2da65d2d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d2141a3525fa9d3bf5d9ba65895943f05edb17f79334b571709c2b706fe69f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d92f5fbaec603cd84ac629cd08a4a1462ae8502d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdfb704cfcaab593ac711e4aae77ea619022cc3a4fccebedb99c03e07acd4221 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7a9931cd580bd1ffedb74e04e8d36a966008aa3a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0fb4e58432fcc655d343f1b157786233a6f061ba5b473fac1fc2bdf4d2e8731 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_5.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..518d8254715129087592113d1d3e0c701bc543d5 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fde5e3a97ba236ad71911a01094819eb218f11cfa9f3076456f2112b4e6c97f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7aa0cd8064ab6963902f90e48eca626fda8f4d22 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf70ceef26823d0c1cc938492725c76b8dded365305adf21ccc1cc3c93d7de31 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..13a36cc7344107c270cb6b23611fd2301bf7ab07 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18c42bf52cb40351d95f96c0db17dd10c33fa94d04693c7e248f7cb5d0de3679 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/1499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/1499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..aa9f132b4e3203ba9739eb46384a8de251dd3b29 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/1499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb32391d5c492e093a1a5181dfed7a57173d650f41a8937045043a6654a4584d +size 144814 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/1499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/1499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..deed70824d03d140b7743b739ff085d9ed8a85c6 Binary files /dev/null and b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/1499_video_1_CFG-9.png differ diff --git 
a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..6491dea77af7d8a9416908347e2761164abe4181 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2c619bff47ae122a5247b1528f8a40c85a112141a65dadbc62886b3a9abcb83 +size 127496 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..996ffc4309be4eb10301a052e03e5dd8f070cf42 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a388a1a15b60d9f44386c6a939bac94001713fd8b9c2583f78b4338bab330ea +size 120118 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..67fd9ce0e135bc0a8d01b659eee676a6b43c8e9e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a764e89458c3c8d15fb1b993214628df564089d2d2d0b6e6f304b5042a5b4c2 +size 153669 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..c9280a89b8e3b30354f265404ad27523c27e05e9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80cf7f467b6a4ea9a5d477c4008984777d1b6b4422f00b945cdb847edeadabaa +size 143871 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bc9d77ad9e0564f4cac00be5781e2752dedb13ae --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03376fd68fe1c21212bf18b03efde0d81c6bc4f3baf415c0dd62b02865c43d85 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5599bd246a144d2f88407bdb9b6aeda3a20af93d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3818dd99d544bb4deee88c116ac0824e392e44e5a9754e929b67a1f6644b7314 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_2.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fe71b6978d3a2d73cb39fe0c77bebfba9cae9eac --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8d5805cb7a4683688bb5fa623b8b3a8772028b31e2a940d1b81dabef654fddf +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..87c07807c377cb8841969db29a08689efa417980 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e9c1d900abb6376af2e0b3014a6427e441603b3da4f2a47698f9c409c77b21 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..baf0b1e05d8fbafb8c2f1b46fd051d90c8d25b69 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b448895c95d5f22d946b73d78882765dfb4b89e61e1072f8bd1af0bc36acd46 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..28288c5f1e3524f0bd1800efa9a499b1473cfb40 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:112fcefaaa59e90f4ed8b49601a40914870531b7eabedd5aae07730d05e00fd9 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..da1041c8f308418db179e35ca765f584526b09c7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5701659e1f2215f6e794ed6021f4af45a515566da0eaf29a9ed9a1f18a23228 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0220a1912864a445affa6d15e2242f199b8375d0 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94aacf663c6d29a331041653f28e5dbd80a8da96a749eba6632a835e74389046 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/transformer/config.json 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca78152fcdc3dad42e3711150203dd6295f1630 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/transformer/config.json @@ -0,0 +1,32 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "_name_or_path": "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..032c837714246cedef2dc3fc22c5cc46078374c8 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:022b56965a971b3eb89bf4244cc27364d75c95c659391e3877a6acbdbb02ebce +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2774954d2d12e0d4f59541ef8b82235c06acacd1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21208c76fcc96301c851405eb589ade68e1c91432e69064fc344a7834748f507 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..619af9a81b9dd5c2836686170d34a57a3a9fdc5d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f07c7f9908bc800d1a4dd4a476981959fcbe40d06c5ab28cc476a9407846aff +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1570de9ba532f431442a9e89e893d4eaa080da76 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12c8fdda78590d57e02a473cfd8952976ffa8eafe281bff73d000a4307a79d3c +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_4.pkl new file mode 
100644 index 0000000000000000000000000000000000000000..84b8bbe114ad414e16062d7d6bc5d54f3b8e8279 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa42a9ebadaf18ae3095600d0f57b43e5085e782e63efd0fd9df34fa3146bea8 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e2682a946d491feba3f308ff85ff6cd315a3b5aa --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9243231867117824e2893c7c039feea2d2e4194adaae132f100d19ab576c98d8 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8098731c4b99e2ee8ad145621c1094f1683b928d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8fe33a42f5edeaee4aa58ba6e6a3b780fbcc983c3535fc6effc0ce5c1f95bde +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..051089f49f18602f93417ccff50ff5544e2e746d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2c5ba47b2652d71ad1fcbda95fa19dd8cfea5a5b9e37d9ecfbf8711e1deece9 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca78152fcdc3dad42e3711150203dd6295f1630 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/transformer/config.json @@ -0,0 +1,32 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "_name_or_path": "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..afb0d8c4b7e2b5bd0175b87f1abff4995e1211fa --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0258873f8004208d80202192d559fa38ea0af65ec95889d296775771962dd33 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..241c9b0eb4c1136afda5cfbad12cac3b7f40f14f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a2b293524ca7d310c3b37f63e00a624a1727808c45acf12349d1d0ae4f8259c +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8301199f095ad1698d990d7a37c99fdc50d416f7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d9753477d4ad481fdbe453559f7b5198440eb602b65b14dbaebc630a2f44ded +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3396c9e24d0006753890611b3a755d1bf255ccaf --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aaf5cf41eb3324fd697c4118f2b4c181535bb4c22ad185a82e9367826833f70 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7f51bb71f3fb00559d91090adc98f6c6e70c2f77 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47a6ec7b1df4e0cb8570d5092103aaf31a2ac09e901784fd3cdf2dda874b28a7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..36aa5b387900e595bf9b74ca8c93d3110609d356 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6f0f843660c8eeab2c1703a10aa30db508c4ced59a00bb7417536814bc8ef43 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9fcfd0edf8d4099f6b6c5afc7770b537822d8ac8 --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa29d79bbe4566adadc36a2777ae1a85c329776f7ef82b103797b4aba4aa5e5 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..13c77cb09124b1cc50737ab44c3a9d1683f3539b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:248542aacf906ba570191723ebf4bd2047c8310f21120de022c9a360ce4db8af +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca78152fcdc3dad42e3711150203dd6295f1630 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/transformer/config.json @@ -0,0 +1,32 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "_name_or_path": "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..d7280ec4c28cbe24e7c9af972e31ea0de2755f94 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b18ba278e34d44baabc384a855c5179e8a1bada0a2a0278bdeb1ca432f1ac5 +size 153161 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..446ff5a0efc460653a3ba2816a1aba0f75f004a8 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287117d5d7643ba31ec4edb4220888e3e3039632e354ab390fc18b48722eeabe +size 144805 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..9588194ccc88b3fb9f50423b2b9102d25120cf06 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fa7af05465465675413476e517f43ff87b883cbf3a4bf2608c4081c4d58d8929 +size 153715 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..062e91e05b7b5d2f115315e2a3cc6fd2273eebcb --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/1999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:321720abba124381620b47e9787cba8eb803ad1c4072172fd3800750374d7536 +size 142016 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..3460543dcd7c86f9810c764bee6527374572eaa6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c1efef5a74bd11c5828c14cea7fa6bf3a30be1e7802808372a2a9a6dc70570 +size 146709 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..9dea257094b6aa0266a742ee5054d82139931f79 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00b3e2c752ac3cf926abd1ea3f05edd8971039f15c2942bf0f1b0f461474d20 +size 139409 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..b056ae0ee00d37c8788b9b287aeb3c3e4750a57c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d5ba7897e123897b95025d14df4dec47f70dce18f06b438e40a2d23849dba2 +size 149238 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..cb792e41b902b2ff19190ae6ee9ec5107b3b2152 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/2999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c128d777c7dab5491071e2e6a43807bdb349d7bab1d8f8426085cc6619d8446 +size 141809 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/3499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/3499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..f6cdc3dcb18bc50f8fedb6536d2dcabcb5c7c203 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/3499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7aecbbb4729ab5af9d55510c945bfc145abcc757a87d55a726687e2aca4841 +size 145540 diff --git 
a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/3499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/3499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..43c10cec82a705101e7be098b137740f7bb33d3d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/3499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4274b237825ef8cf5d05dfb41e4ed36fd5fe68ed60a35d9c9a8f93e75a3e671a +size 140719 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..4279bc6a87471945e644d0aaa512d4e46b42424b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8fc778d368d5c2cb79c7b208b465f91f0df0b3653679648d8f4ba8d3d0ebeaf +size 172462 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..39ad914a2ae6a66a4c753749ab5b532aa800f81c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09fa45bbfff36049e1418e1c56d43cf7cde9f6e74b57269422bd9132aee4b135 +size 162875 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..4a80cb85e634b71c019cb30a98026cef72b6de7a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b01b8e129b539a85ed3a366f5dda9d8a2ddeeaf4c86fdf85666b1fed702412 +size 151513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..fe9d0c71e1bb9e1427dd51b01336e7555f94cfd1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2c7dbea7c77c3a35238f0fb5e799c84a960eb7fb86424199551945ee4bd12f +size 142225 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6044a829e97343fc989a7776526ffc69ee4404df --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3390ad3b655719b614d35faebc05baeb9f18908e03d2463560337fbab9fd5b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_1.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9ca338ca74aa21106febd42774ce622825ad4369 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cfebfb659cbc2e477fe070cf3e06f2760f7556a6ece3e5c43c3a7ae17adcbb4 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a16f711ee381ae978f48824020cf54826f6bcfc6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44cfac57723166868b770ef6b3eeafe04c4b83aa1e3879abeb74130357c7b96 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9068a20abf0c58b19f5b254e873414915134c438 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ea0b25f6dda6e0403443c54901d2171d43277aaf943b7a1cdb229bc4be0200 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7d17716ca164ee1d185566831014c38ca74f65b2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff9ba57f542479d6cc77064d442c52245a56682cce2fff8990de10958597e198 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4f3d4c4a86ff3b63f26bc0b35ed01c835e5ec585 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4bf7a4e612f3e4ee2311d2f5f0e3108976ecd349fa20fa82678216103663879 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9c33a7b88216b8e4564559b54e48e1850b7df813 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0561ce0cacbc888e6948239ae4c269ecd9e0312e90544751f57df894c4c4a03d +size 16513 diff --git 
a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..58191d9ca1d8ac94e7d8b1620aa5fe77a2149574 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a383512586e729909ab764c530d67805c1fa51e6b252fb24ec6700c7f91d674 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7cdea23c610b1a125c3127ae614168839a6ce681 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c22fd17dd1ec876331c03a6d2fed8e7638b5f1604bc6cde5ec88bfe5de56fe5 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7aabca05fa70e0ffb454e792c5e05e88a3286016 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f3ef45d57261d6bf4fc88fe327a930a433b39d5cf750d92272c24d86d5f5dbc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a8c7ea19b77b5155a2ef3d557d5336b44a7afd25 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30773e39afe05f34a21358ef30c70dd4073b605515cb7f96242f0b8df9c90432 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_3.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..67f78c9ab3085275681aff995eb747d140d8afbd --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15f6440b2d24b9b1f1ca37aba03332f5f4af141f76cb81488e5a7da23884cc0a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..31dd28c8a64c2af555ee101030897bf9b79d20ea --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43d8edcb02870952cb5f13fa35ce6a52a54787ce7b9fce63cdfba310bcd492bb +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4e604f0a9f4bbde9420c963bbc08ab2dbf41b413 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00abbeac252831ccbb02ba49d6b1ca999355b0665f4b2bc3f1729386bb189fc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..67226241b467d4470b4f26ca7f953813e340881d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cd3c2a516f7049fa4ef80b64b0a1777fa8a8f3f8c330828941de0421f53c924 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0b1443d18ae02fc4cf8c1458405de8b8c7b7ff00 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59d742353eb4b4baa3bb988c6596d4e9fbcb1c74abbce67ccdbf3a6db57b87a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + 
"ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8ae1a448a709ac4aab4bea68864136f6a2ae0a7f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:461932b86df147208cf79d8da4bf45b736a38866ecc556c6231f6c7bb1546e2b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..549e130b8ad9ff81ebc53e8886fcbe99034950f2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad84c1f50bf09f49c56a29cf16885d9df40d38fc70bba9f77631d3f50ef88dc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3c16847ce86ccdf01deaf825aa9244ce2da65d2d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d2141a3525fa9d3bf5d9ba65895943f05edb17f79334b571709c2b706fe69f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d92f5fbaec603cd84ac629cd08a4a1462ae8502d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdfb704cfcaab593ac711e4aae77ea619022cc3a4fccebedb99c03e07acd4221 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7a9931cd580bd1ffedb74e04e8d36a966008aa3a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0fb4e58432fcc655d343f1b157786233a6f061ba5b473fac1fc2bdf4d2e8731 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_5.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..518d8254715129087592113d1d3e0c701bc543d5 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fde5e3a97ba236ad71911a01094819eb218f11cfa9f3076456f2112b4e6c97f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7aa0cd8064ab6963902f90e48eca626fda8f4d22 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf70ceef26823d0c1cc938492725c76b8dded365305adf21ccc1cc3c93d7de31 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..13a36cc7344107c270cb6b23611fd2301bf7ab07 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18c42bf52cb40351d95f96c0db17dd10c33fa94d04693c7e248f7cb5d0de3679 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f7583b11e70125107097537288873d35db299ff4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccdcd07f15857bfa0902673df610fdcc2ffba523ef9ee3fed314543e1ea5b3d4 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_1.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..a726974eaac38a779c51b5e818f8e7e018880b7c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d38326b926de3fd6c654d9051cf194534892acf564c640d8ac498e7084eb8a2 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..641ebc0c281b25e5ab0f9bc1632ff8a3ae57ad6e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37d3305d71f6f3f3dd64f705e7938f481a8a0c5da21354de65a283d2ac8326ba +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1c2fa99d464af432879edb61ac2431d65ae0ebcb --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85648f254b3aea2ebc462b766032de1927962d1c5560fd50e209b1a8388df502 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..583632874e35a826068637dac5395e8c1fd2a6d6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df31fb70d2dbf0da545de990999aeb26d0dca28f179fc93b6588e6a261d63876 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..de9a6d96536d2d73ab3e75efd76ea249c8beb047 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:428c7304d7bfee6a7c8271460ef13cbd0d162b3318904780c5311621cdc7426a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e4b7026edd6fa78464dfc8975dd41de5d080b591 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:917ab2696a14196396beb26e99e6edc3f16c0f5212afbc10009985860e011ab3 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_7.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b7c38e118b8a0d5a557a3d3430a906619111022c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeb02a2eb491828f0f066c7bad6610054425e6ed72ccea20048e24d49f060de6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..81a8fcdd3e7ee309fdc1cbff0291555da5f3bcdc --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29997944a56dbaff277faf9e092381a7f2c5dd5ca5be9a88502a605959d72e1c +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a44a1eb91417e1f9e9bba1f52cddc946f6f68a27 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6278ab50bff7fd19fb6047c5eb64662c717995dd2cae8866b96fc68176a414b7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9a601880010a8a5f5aaa228ac466ea97bcedd787 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8116aaea71e0dd554f7ca70e33a1a9863ccfca819d66b1f77e6050c65e4b48e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_3.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..a12e29d3c45336e9a29e929416d8c495faa6bd8f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa318515027ca16713842e437bc5d0f609b8bc5f971349a8f75e45cf17ec209b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..34922f9b9cbbc0e2662418e9839f48e7c505fddb --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a609d4eba70cff92f20c052aa8131388f849fcbfba044715661ccacc1c70fd24 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b162ce844e8bb9b666f87d5b264b256ee108325b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d6e7cc24a40cb1403a904f725f1225fe4a0286b2a1f09880bde2526a8fdffd +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ed9559d4c5ab5bdb43b7303d7927d9b859ecd4a0 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c017a36e1f81664906d6e716e90b29ac7a0ef984ca1582577f32d302421df29 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..be21cb79ba8ed433b0e0dec204ccc537caa075da --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d962f9a1f10f0b235f802af87c66419d6744dc6e0e06617967660af1b519a9 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + 
"num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a5f3f08c60a5018859dc359a8618cbe0b9325868 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2bc8e8e27768ad73583a19e31cc732da9d688b2f8773869498504ad37aa185b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..937b15f94fbd477ff5625ffd6648bfd9d7e7c835 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bb52b038ec31ffa152fcb803729021bd4ddae3cbfa5e9aab0e8458e1f81991d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d87b8b95a36ef8015725b74b2b5e92baf0ddc18c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:120c2ed34eac465b593d02250a9d7ebd75fd71c2bf0ba9f5ece02f535a536ee3 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2871fdbe2614c5f1c007a8546e850a969fb283bf --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c299c19ec6bdf661dadbb03438802041e106057706aeb3fe1013e88a73f0b490 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1fea92cf82ea755c74163a8a9fe8a6f2f4ec0341 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fb42803d256f6b62bbb46eb534e506ff3d39e328db78004e184b820ecf13bb3 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..39ba698f8f771e793a8daa1558d60c14450c2a2c --- 
/dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4112adb7f7481fcdf5a0443aff0ea0cf36d0fc99ea84ba0d9cd473e70b2a71d2 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cf1a81112c8b6902fbd637fc2d0a279f1bbef2d5 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eebccf66daeabc7061a3a812fa861d6316376025339c7914735e4161937db74e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6438909b6214fd608d6d66524c7d7a31baeff3aa --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:434838515684043cc1164af425bc3bdee1e18daed601ad43198b6ae31473e1cb +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..264e88d84f7393e8c3ae759cca3568f61004a074 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b533d48bc0dd2fb205f2cbe893e53e726edd71878a3c540a1b4cfcc5044fdf4b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..95bc2b68eb1cdd5a13feaf142861959f7b56f159 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:19a65a42ec1f2e764035bad02c00cec597dfc764af52454d0321a0bb5660b3da +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3026b9392100221f4d179817d44722401e8c72d4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223394b7c815ed9f97e9499defb13d27537d77b5fff867f8c91d043d2dd8c565 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0199bef5633cf0073505ac18cdd0df439269b71f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e6e0fb6d5887bd052ad2c9eef81fc70dfa91bd186685b7b159877de0d6c6823 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b625ef8fd6a2c3272bc6cecfedd2649adb15f9a0 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7fdb1c239cf12ac6e9fec014c9f5f582221f13a1fc74a71e643ecc3990330a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0268be3ab6a494edcd0771e848a129f786c44877 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8fe32621de72eb564e938b0e46b29315bdca5847a3db10b54965aad65474790 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..62959a765e8887cc250d6c48ce77415d23ccae06 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea9a36d510b976468fb32edfabebc1b1f1570fd69ce361e142c87d7a05ecd4d5 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b755ba74fb8ca60963547cf7e38ba15d26bb58a9 --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc55de31b54d24bc3a17ba313f3a5da0ed5f3958bc82323e56f705410f9300a6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..3be830fede518a8205d018f3422730b27bc68f08 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f047fb97b642dc30b33c713788d30200b71afdcfb75e30c504fdf6c6a207f390 +size 163793 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..bcc8d3c0fbb18c2aed90fb5ab881a79a74a90a6c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c061c65a6ce343b1660e7e716906c76632561aff9ccd459c06e2f77a6c6ce023 +size 151206 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..96d8972d2c2014040d9269500a147c781820c929 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e98ce360ce92d75f9a36beceaa6a8e65c74d6190aa53a74c43f7bd80ee9135e9 +size 160429 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..71284bdfb848395c693268f4a58f7c484b3ac992 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/1999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2805ac51dfa6ef4de083f749ae468d53d2e04e1fe1f53862f0e92a1b77c77c0f +size 154531 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2499_video_0_CFG-9.png new file mode 100644 index 
0000000000000000000000000000000000000000..527bff166840b69be09c32f54b33a8afa7349421 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecca4db815beca263f13207354f4e3e1b9cc6944e4a79a1a08edaefebfa40c19 +size 164005 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..7b859d41e223d88cfcc30a0d94543156bd92097d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430592107b01c838d95230455d6237687f51489c736abc6f4a54a3a888b2d9d4 +size 151819 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..7804b818e67765833102006e8205c3f9a2753821 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52422bc6ab7caedd5b8c29f1057d92a2c74d6c38fcd27ede46ebec43ff8cfe59 +size 106641 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..a8bbcdfcc00d28eb2ad200c7ca671522dc63446b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/2999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e35b9b7d6b7a08065532353cfef040cacdaa06a696822639c2349c5ae229bbc +size 128321 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..a7ee239599090e349a38a61feeee19ca1e7d5600 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ba9221da0bf3c4919028ee3c41da6922b3b52403bdedcf340600bdbd33f512 +size 155417 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..73ca44051bb59bfad72ceb656c79c9743ff4b68c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:227067a6cd64b7cdced40c483f0da283047ad4d0df910d1d6deeeb57272326f4 +size 154964 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..c8652ab59f8567c725658b38071a10399cfcba7d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a50d3903fd31767c61693491fe8690adbe9040c7f7f963a469cda72c36ac636 +size 116216 diff --git 
a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..ea92180801509d5a83faf01f800be416c6a17141 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/3999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffacfcca81b53cb27319a5287eebacece7bb2bb70ecf2770cbf90c249834bf4b +size 137536 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..60077216d24148c8123a137ea44ec45536c6b8a9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935711ba29b3ab6136917a68834005a2c21d880bbcb78d211043cfead73f3db1 +size 134205 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..8f68427d9557ffbe515f9e696aac833d8769bf7c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf885e1339d92cc386d12ff933a6b042a9f83815a2f473435cf3337ae3623e11 +size 152408 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..96007f6d41f66705495c18834570ebc055c0c490 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd3a8c8c0c8a7a7d4dd9c5151f6f7649ed6527208d08f9f8255652e8b034f16 +size 123734 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..8e5214a6dc17030c736e893a0547d59bed4f452c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/4999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8333d970fbc70e45c642cb23a3e7fd7438ecce32d0be670b3a387f953daecf1 +size 167486 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..b0df40b09ef1cda6b9cf9273f58db521b78572a8 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0f06ea56e9a9c08850cac9ed8053164b762fd6419e22952eb7b1a36ad1da224 +size 148943 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..00d6645819dfdd4a9693792021c6d5ff74a0ad29 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3d483725c07baf8663d3b6db52b3269fd8031f2ebebafda06080754ba5683d7f +size 144108 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..3d134aad94d1ba4ca786fcb26757af5f22e0b782 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d433cf43a3cb8d141289e76d86186829d37e199ad24b3cd9f314d064683b81 +size 179518 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..508e841842459242c459c798515a257d3fc8702d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cd8c962e4d1b79b5dcc9a9fffa54e2c2c3fc7704df985575327c2ad146acead +size 175060 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..28bdefd5622b7e455c278960cc77ad2d4bc7001f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6c41d57fcadc12fd69bff3e72eec586a0132a0fee1b3e93dc6c6f1b5d2c41d1 +size 164647 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..a80e2aa0aa0e3679562e074024cec89b3c3a9454 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/5999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41402987f48490139945c8630ce10fcfa3044835d82af955b434a4f01cbb0be4 +size 132213 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..0643baa4f180bfb355192421abae8ea5a0429f31 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d21e8a2ea1688bffb9dcda19294c42221521cf3c627020900931bdae73005d7 +size 176092 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..051836d358de492344ae4844304ac614265f51b4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a609810c96cec2279a467c884847027bef702479b858c86a3e752d4823f68f25 +size 126319 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6999_video_0_CFG-9.png new file mode 100644 index 
0000000000000000000000000000000000000000..cc30282c0d2ecb0f9c80794ee74c8d8c605a4bf1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a1fe2fe98784f7b88413b766592c5a488a59591b64999a55556dcfa562776f2 +size 147009 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..98ced0718ed5d975db5b40e88bb1324df7e18cd2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/6999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6119b9f39242430c319b0daceaec879ae9ad9083e344212bdef0e1cd151decc6 +size 136868 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..352dfd00189ac53d2a35c2f541a7d11ed76db53d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b01a808f5a897296f898e99594f2fbe5296e949754f2c18ab4c841fbfb3180c1 +size 166963 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..d9f84038f5553861fefff104f431fa13a67fb55a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0ee18074e9b8d85c455bb9361748e3609fa839ec598d0ba23b72c54cbbd32f +size 161518 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..f7f4dde7d9bbe7acf63ef12ccf92771deb4d73ab --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9652b904aa757dce7aeb7b829d3941c14c3be1223c9043c20ecbfa2adeb780c8 +size 155441 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..203875ac6f102c9e2fa58ad8be4e7af0cd73898e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/7999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15555bb3e2ce8b16ddcfa20cef56fe272f4cdba3ee6f67eeb67a180bf2eaba00 +size 138955 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..2d73374aa67fe4facf3a3a89ad1aa7063c54a85f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8499_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2d1b91c197ca101b35012f0071df85cf87e090310cbb4ab1867e35020df33c5 +size 143284 diff --git 
a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..01ca7c32cd50930ca9b996de5f6a3e8d36c9db2d Binary files /dev/null and b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8499_video_1_CFG-9.png differ diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..65a846e6f728e7d27743b9f683fd40cff01f8f1b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03ba3747205343bd9935e61ab2385fe52d07526e752022a0229a955d695ac70f +size 176256 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..46fb24357fdfa794e12e73fae70f4eef89c56781 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/8999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8a6153b15016f58ad3b04fbbafbb6d504df8a5bd258ccc404656e33977d223 +size 137044 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9499_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9499_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..febe374134f9679f855543c80236f716c1901c15 Binary files /dev/null and b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9499_video_0_CFG-9.png differ diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9499_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9499_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..bb62e002af8d68983844dad322e0201440d38faa --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9499_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342589ce9380e8bb866b85d1c53a018795105fe91a580b442f2bb385178278ed +size 166363 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9999_video_0_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..c2aa59ddfc654ec9f82d4c9abff61590cc837f39 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8328d2d0556a95ff2759ddf86ad4dcf2e293d935c74841fcc2a3b4a6bcafa2a5 +size 143482 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..1a5c6ceb734f615b46258dee56e70de2cef8ee42 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/9999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:980ee3261a5cf9cce942d12e4e4729bf254c67eef014ef3b6cf421a272ab480c +size 157765 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/999_video_0_CFG-9.png 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/999_video_0_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..d71a3c7a84b2a61722a420daeeb652249dd5c59f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/999_video_0_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd26361f0705a90a6329c8a9cd00e16eae894edbeb49608d76a9da33ccdb3f5 +size 171306 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/999_video_1_CFG-9.png b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/999_video_1_CFG-9.png new file mode 100644 index 0000000000000000000000000000000000000000..4c0a3241e815e1072e83a7c1930add5724c2e6e1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/999_video_1_CFG-9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec203cb5c36d2873217dce75eb242633b0d4fd0aa6d1b1a5ae91203cc511aa7 +size 154952 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..164bb453b5d4837f8abfc80013b2f5014b1c4526 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9566c529f1e8631fe7010ddbf645c8e48dc06aa7f872a40b8c9385a7bdc90d01 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..69367038ce3bd9d6fd56f68c53e7d5ff52383297 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77756ce712423b3bd272c73697dd7ae47c49aa34447b86fd32df01bc4f600eb0 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..72bbb82b7d0580b601482903546aff9aa0f024c5 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060700e247197f50cac6c85c06d1a55306f8eecea72d7c191a44debeeb75108a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..569fdb89762d54cc640254be31cd51edd859de59 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5e5cd6ef288840cf99a0434023718ca2b006251831a21fc160f428e10ce989 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..875875d234a200e412b5823dbc565f6179491ef9 --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d993e8309024f8f66639ce8e5b711a1cfb98401dec6022a4387ee02b43f4b7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b339b270ae7aaf8d0ffb3ad58ac3624ce0bec616 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85bfc5d4039b634d421d36d9dc03d001be120c4696ccf148f56ed20aa43a8923 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..51cf909816d7fc4e026f81c83b33deddf2d3e7f0 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74be3b141f25ef7e7008951fda5e6ca749dfbc8e87e0636b459cdcc2343eff3a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3e7a2ea9ee123a2229ebf9bd143a776cb502c17b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43245b399364e517d280e848e96120f9eb422b6134020a37e231e9be23125201 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..24e87fc868dbd6ae5a8e07de63d2648235acbde4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22ff5d00f25663b2482fd6e6065a59eb3c4b3f91e830c684ef112a0eaf9f13ea +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_1.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ff8133f6dd0a1d8391b8912a768a3afd395e552c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:206d1c5ba8c081c2f57f12e4ca8b90e2b3ca1911f5f0d128fdc7cb1571668dce +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..aa984a9da801fcaff87b374747f226280a938156 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8278ca046015c88d3c7f512c1497e6a3220f7bd4d69c7284219487dabf0c374 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8b381d596302937bdf130ce9adff86d6ff96923d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:406c9b3cdfd94d1787740863e4806cd2ef5f0f7b07fd04328ab7eb7d2df1056f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..09e4b84c64f1caad6e0f361926d2dd022ff9f7f3 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682e00e32bac9fb4cb9cb68bea2b48495aea02e76f1a746e32908fd76bcb8af1 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0ee7fda90f33cf9245f5805108b17d2fa1f904c1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a8445480bdfd86db868d8dfa4f35357b41723777cf64e4863aee07c6037119d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..62e30678ea1c6e0f49d8283bc2971f55d893f1cb --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51db3b4f941329a8368c73cbb2ae55f592163ccba6a0d404db9bf8f515d72f0d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_7.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..a9480aaa4c9402ff2a8313a48c117ffa2d6a6a49 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b176af910ce6fa9ace381e023a4229a5c87cac30aeb47199df7a60a9f8761e0 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ce0d4b559eada128ee9498965d19e0cf58b91ee9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87d9b4f124588c7c5610eea9ce79185120816d458ce0d89c6d98d2c7bd37305f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ef3ac76c4afffa3ad633588cda54a2b6f21c897d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18865050fc60ad91c9a6e434d638e553cf9bcfbc2164fd0b76dd3d03322e56ad +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5a7baf657464e2d038f824893ae11e3ace3051ec --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95d3be3d1ee42e5d38967abfa42512b46188589f33013cd5a5c5e87b7e498045 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a675b15015d3f512779f1fe2d3963424dcdb9573 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22cc5f40e08949640f2f9743f400417e88a3921c3d642921e4d7d0e1bb59bea9 +size 16513 
diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..41e6afafc503829e9b36fd01bac1f957362d802c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:117cae4db85098d3476ab68b541539a727d1e5723a6b37fca3883e8231c62e0a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e8222168025e41f28bdbb36b515794037c7d56e4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:954e2567974690b4194f07fce601699608845578eb539b0a2ab07666dd1116ac +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..dce7fc118174d2a0f2cd7942c5fa52eb337d44a4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2185ff454fc47b92bfaa90e8fa7f392120e5cdc4e61e2ec36f0a22135ba6e73d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b2250803db9a9655df5c288bca75a6723521878d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deff986e3c964acb9a1a6b500dcb98001a56028b8f3e5080f4223f40108a2c75 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..502a8538bdce73ec9a972c39425d72221198545f --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f78a760c044c306d995aa67de8e9c659165b65fb01a446767c1b423707ecb4b9 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e2c63db09605b3a5aa00798b597344c59393234f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b9769e73bc31f3805f2a3e0e04421b320adf0412bfbc8d4a5de41fc234d33e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..283438fccb7dcf8628d9ae4eee7294c15f8927a4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e84a073f9f96aeaf68c6a3880dd6611a7e640d6ed50e8aeebdfe9b41ea261826 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..eef059109f06da19cc479a4cbc56e8fd5bd3a730 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e278b89825dd4634998ab5f54d6763ce7915c2fe7a91ad53bebeff322b709e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9a997745438530b06cb5fdcbbd0e57f629790e3e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885d2e96331d93b60519171e045b30e4a0802ea79c54c82b74056e9bcca8f2af +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0c58d4064fe2c66305c7352679653086884a9f7c --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e715d5583b6e5eadc5ac39372f64d3579843977b33798942657213e68c173db +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d8b4b2a57b72266a73ccff79cf02530b7667e112 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:28d3965d3cbebbcabad3e980654446dec7d75b6359e0717c357c8a2fcc95cc07 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c7fb992b8006551ff736ae2b01a800f0ff86a4b6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:494b93cb9f1dc372f43a49427a1b7d4b46b27e3ae0d01a4bb80f0ee3df5fafff +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a86041af61d91c209b19e3a8e312feedcc872270 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3030ea97b337a18bba7f203c01d67f07b561543644c0772887358d788e34974f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..af70b1bb1cff44dfcefd86eae8de6cf0e1f7f6e6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151c7aba50178b9bdaf3d5663c4715364d98d9685b6a00072064ce2b275259cc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7c8bd1e6328c3bb24052393705fd5fea04a061f1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703cd9b0b9c424250441c1436a5e6dded0e2cf186c9a052d92424436e4d87b94 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_3.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..f8570f0174bc89f1b636d70e57800a99dd6af9d4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:899f4193794df4e24a2a1a61c8fc06f02192c6ce07b6ac4196026f9544e905b8 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d9a6c7c34ae1fdb5933f07cb2a702191be6fa5ae --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:483f44759a4e50f5ce7ac39c8678c8c93f628d1311ccff4e6512bf188662a075 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9c17cdd03de2f3b9e7808bae8b96909b34b1278b --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6eafa2e7dc1627e370b4dec031fefbe457c3417f34f98a5662f1211f5a734ba +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..95f1d902eb0c6745d620bfa3dab906dde880dced --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6245dc47d5126fe0fa5261dc19d6dbd9b178d09387221da232c6f279e9f51f6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ca2be54bb8e1af595d3401375a633fd30d525001 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ed24a37d87eca08e9700d11c8407cbad00b1bc3ec8f9c0411d15a18986a6b67 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff 
--git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..aed20f02c3a58e79d9309e60a42459485ea17d62 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6faff616bbae42d9f802b58416c0a1c0cc5991fd21133de2f37dd8d4f787385d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8472df94ff3200f774bfa13d8a6eb7d4fd217668 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bf04eb34f3f4f6a05da0934555816e26f4b98808ae0703133a727404db7e479 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e7cc600690627928eee481d9dbf99993a8d97506 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0943a4874d0a8ad2d4a940b2e5e039f0e74c1efdd515d317d5ca7eef2864bdc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0e27414190ade1b1d806804bf811b9f339e75699 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b914ca6c7b859f93591e68c43fb0260308ed5978533ccf1a35d28b6f3a27b337 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b8eadb7a20a15a19e7ece7c49cb5928f66bca24e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c9ec8768c2efd801e4bc51c46ae83169cb26d1562ccef1c995bd746a731431 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..06dad9f35d19a507baa7f9b6e06f6dc097270f33 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4f41f62bbe293fa6d3a33bdee42a1e2edebeaa6190430d4478cd26ceac20b0 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_6.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..69ea244d3cc7984493d6745a56bb1d2b3784bc47 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ad2d14f157ee9811e396ca183bbfe442de36b0bf8f3294f0101626b3a9ccd51 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..773575c38b6a7a44fdeb9a3c0981e3c0f3f7e62a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59b74cc5055c1386a6cf5bf345695f5a79fb20f7a2e37f1a4e08f7770aa9fe6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ae6310e43be9ff986ee32c2d0cd65b88ba20d846 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5d4abe3783ac35ba6b3eea431c0c2f4ea9113ea1e58a34de7c3729b246da033 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e3f50248d493f17cddc507f1c7ad3329d9b6f39e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:958a10c0f41176548f6b53e64429ea0a9332b4628c28eea29cf86af8670efb96 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..50f9084c931b316e789e5483fca278aefc11b794 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_2.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:063607059e9f5f804d2695ffd28056bbb167f65239e7669faf0baf4984e2d119 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..be27619c6e84fc56dc57c6dd4d5f378df0881bcd --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe371823ec4bf9047d3e1c17a22d5b713bdd1c1ec93023692f36c1bda9517db7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..549f40fbc8630b4a8afcfbcf883bb447e4afa168 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db5939cd80157fd9e7bc423681ca9d8c5db02b1e226662947a3270ac5904ecc3 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3af4fdf8c47413a7ef53531ebbf75183e6d3f3b3 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc3830756ac23805a9673a00a10b4319c90aae953622f45c78b1121a1e5e54a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..86441127488da68f6bf3f9a6677392f1f64ffd63 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb2212dd9db7565c5def9306836a7c4bbc4b47b3e486f9c59694b36c61e4c74 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..83176eabef60c5a8824a78e40276195baa3c2967 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d6dd8f7da22e1026f400727b4dd4c2e79276c0d11c9eac20a979847b98cea2b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 
1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8ebd2a2c6e0329b0d71929b6198b645205aed6d8 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9522dd6d6f629044f2292424bc28d8f77e38d021d948b7b3f1fb9bfcb7b2bd72 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d08282022fc174f250999c463acd46b070b6fc96 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4aeaa326f5347c9612f364dae8b6ea5fa3d077703ceca4b88ea8ef22d8b0d44 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8cb388285d11fd1fa54b392ea00ba6b875ab2fc2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9884f1f246383d938505e8f2793ebd1c6fa5a733162416c20e07d240cb105c4 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f81bd664fcb32598e4b23e6e474f9e0c7e688891 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f80db84d8658809eb4e0113a57b4a32d6674af3902b23835ff5734fb0ba1ab23 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..86301b3a66b90b4ddbd5df4eff91c3ba08bba0a9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47780bc7316848c7b6d95389ee9d5f3b59271bc8fdaa0143eb7f64cbc8f43acd +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6e2793d2f9777371cda392b5a657ebfe1e3c9dac --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f866adf4a0da01caf64e01751400e0631d924dc301abe2329b634872827470 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f73f8516be54c8984306764a09ccf50b3f49fa90 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5bcc45e6b8475762f2483530cc9b31e33ff7218c2e7d570065164efeab5d40 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..164c3a58583132f2b88f0a2811c78360cd8291bf --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea2ae81986ebfc524b63d14e0b1439a392cf0a24f36e4b2239bcc9ea10097c59 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ac6c9bafefbb6094a2db8b6319283f59566fa6a7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50031d74747cc7be1fbda5457ab21206ad2d7177e9ab913c3c151e0e1cdedd7f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a49fb9252367b88b436e1528f91e261769038dd5 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550ce9dc627cf09b848f8d187022c696db15d4b4d96aa5d40c2330263ab7edf4 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_2.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3032a21e11a2bb6cbd75c7520eb86314a60a6c8f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1bba0fed2cbdefb58a6be24c165586fd9b2b7ec174b9df4d1bde03afc5158e9 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..df23b679dee6fe5ea2381e994c3ef5d5e5957f10 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a25f080e7ecc1920134bf3138381600e8f6278d3c456783ced5479a55ded86e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b671466ce7fb1760c67c924fd19ed034f6f29175 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c90a22b919de7cea0081fd3053fcab66c3ce3fb1d9c23e7d94bad5744ea406d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e515c3cf2f49d39c66d0c1f0776e9e43022203da --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98d34c960e41082536e9058ff03318c606d948830d6fc0f2dfe311deefc7adb5 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e1e00c535f1db7b99e3b4920c47467ef97a02a0a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f647dcd1a5bf4c26852c6d44f129f7009977f4f31614f87518cd2f654515feae +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ae2f36c15287b7a642aba8e19f728249a707c7fe --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dd8c7ca3a0436379a78df7e0bb9d3a71814f20fe2a131d256469ba847d36eda +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/transformer/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8f3c6f4b2049639363ae9f2492549a79a4c79efd --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b525ac54469b87bf42e564db65b732e47d7a6c2fedb73ce88fca951ba22c02 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..549e130b8ad9ff81ebc53e8886fcbe99034950f2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad84c1f50bf09f49c56a29cf16885d9df40d38fc70bba9f77631d3f50ef88dc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3c16847ce86ccdf01deaf825aa9244ce2da65d2d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d2141a3525fa9d3bf5d9ba65895943f05edb17f79334b571709c2b706fe69f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d92f5fbaec603cd84ac629cd08a4a1462ae8502d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdfb704cfcaab593ac711e4aae77ea619022cc3a4fccebedb99c03e07acd4221 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7a9931cd580bd1ffedb74e04e8d36a966008aa3a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0fb4e58432fcc655d343f1b157786233a6f061ba5b473fac1fc2bdf4d2e8731 +size 16513 diff --git 
a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..518d8254715129087592113d1d3e0c701bc543d5 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fde5e3a97ba236ad71911a01094819eb218f11cfa9f3076456f2112b4e6c97f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7aa0cd8064ab6963902f90e48eca626fda8f4d22 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf70ceef26823d0c1cc938492725c76b8dded365305adf21ccc1cc3c93d7de31 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..13a36cc7344107c270cb6b23611fd2301bf7ab07 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18c42bf52cb40351d95f96c0db17dd10c33fa94d04693c7e248f7cb5d0de3679 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a1bf534172f154c7b7c9ac40cc1204a054731e2a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e036a43462313993d20a840849d761a547570516e378552f8fbded2aba330ee3 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f1efcebb6ff3082cb74935b3810f6ec39172db8b --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e473a3904402513304124bf1a060074f51ec0e9122839cb1ba83248aabaf6e2 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e9d8c821ab9e44f38ce61a8d9080b6c3de257b91 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157f933890fedb4c823cd712e521183d30a617797adb85dda96618a084f3a76e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c9587e7136765c6a5dc833e96644d8d15b228959 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c971d6b144c2456130a36a9c929be623af160fc9c264a8abdb50a10612dc790 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1caebefe8440a033c12f75cf96c068b1fb3bbd64 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c298884d6d98c2b30fb47becce550790e69049dd7458eda02a45a6da8ab5a7d7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7fe9fc2f5113f7938b9c7ae79576ff16cee18682 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2901b1b271ec53276260ff2f5dc3c661c01f48dae5ec7072319d7322d526de6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e79daf6ce314f7a01d6acc4bbfa7dcf7498bb80f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58811a274df6dc3738daa8faa99d7efd2b58e8e5103e4e601cfb0b248d7c786d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5cef77d681ee54f0cc4af9171dcd42fd2cf46c6a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:75eaccdefa59c17c0011b7d97402414c814fc101d701c025ba718b1b69023b92 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cfd65911a62ba090733de4b6bc3ff563de556647 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d94e54ce7c67532f5abb13e7d835d5051a5fcafb2b6a67f52965c52cc3b901b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e9d97b5a293a58951e18f13a7a8d207f6594327f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a48d97d1a64ca5c2e5c5d6a938a12146bd8cf5955f725278a3326c13e54e6a13 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..73a3f364f954304d417591007ee01d0bb8840c52 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7018a3cb2c6d7f3660eb4aa979fb2bc7b1a42825ddc447b1e211a138ba2a04f6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..201fa8efcf7d860dc00c3d22bf2169ae6c0c0011 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c51ec50ec015db7242529875dea5cb59bb077351717d78bd322813c56a00a933 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_4.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..4d5a9a53b332a9882ecc1f44138abedf2e599ad2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d26b8a5600c9a0d56b678e5a67061ff53970aa5d6097ee74df1ad2e4c1328418 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e2c034fd1c279da34cda0d5b508443707c632bca --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3498eaed2e1196dd5935a05efb8f7131fe942c70dd416ea455f390f531ad1026 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d1b88c336f7d3fed30f27ca89238d7119d2e536e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fa749747e14693e0d98b2cca42d42c738f88da5383f79c158adec591f4fb6e8 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..80eb892208b76f314df21854583baf70d4de0556 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd43337fdd7dffba2313a262852fe8f8851380fe32d8ce5774ef85467e5b70c6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3b419df5be8d9879cbd2d1ff7b3c867dcebc8d16 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654b54b44370e56baa4b7a4d5910d487ebc396ce72ea7970086a0cc1b9dcb3ac +size 16513 diff 
--git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..147f05162a8f28fd43a2b0f8332574da913eca03 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92cb91a31a217c19dc7c46199d9eb309a81190573ece397dc77c3fa54f16b559 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..56adfb5f9a699adaa6894159f58a97ae10cef0b7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e85d99960026597a502498514b19707d429d72fc10eb9d1f6c568a2e4c6bc6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..86c1ac0d86156a0787418d925faf005cf94903bb --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242f8ccac985a83c1f88ad62b9e9f281fbbbfca9ee654a982770643cf066ff3c +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1811ed50dbd75991453f1224da7e82f987a88952 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82edbec77fb671dbb090e0b09804bddf4a28814fa09c9afdcb3ff344ade54db3 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a4177abf9b582c459dd1d3d7742769a7db370317 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef75b0858229f8eda1d9581f671bbd9be0e8149840247bb5cd23b3cbd9bd3ccf +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a2a1188fd0ed40c2810ca4aeeccb7b9bd5e919c8 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19667683341d203409e6c85fd2a15c451807f182f77bcaed43d8c9441380d84d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_7.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0947029cb10daf5dae8a315bc323bd0deb58b681 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d59f172133ded104350eda43da3707f08326fa225de174153105d207a6613fb +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1ae10f7b4c30331c9bbc62955e46fcd4a4ba0e02 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f69e817fa22445e66f0b591d86b1e0360922adda403beeac09e547ead16dc07 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d0600d1ea345d353ceca34fbc0a3c50303a6dcea --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572d9b919d3de1031fb96acaf9b790dffaf55f62d91511c2a62d81d21e960a54 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a6bbb6864bf58f32f5b6c34fcc7500916908acb7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0785afe1b03cebabddee4877ca6426d12411b54829dd13ede2fe314a52476ea0 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..85cd1c7bbb968590e7d3e5c00b04241cd055a0c7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_3.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:673745205428bfb2c0334a801bea58dbdf6cadbfd450a0988ffac8fb8251e38a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2d427b89da9f5487fbcad216813b7982922d595d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d3b3e1d7934e66d32fe5da50d1cad82d76bba59060199e6c944db68b8148151 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cad915f065bd91f57c4e541194629b3dad474eba --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c9dec18c57d77907e74b8794997b8c7543e3a35534ebaf4cbc9dbc9c4b3198 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..697720fdfdabb227c144913a5f8c0e3b69a2c3e9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b1dabec0b1a9f3c84407a2f11aba93e068cb1ee5ce23c71b105ad2ca3173b23 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fead1ac7cbc7561fb1f5d65633584ea39c341fc4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27c9dd421badda9a755dea81aea5ee8c3d9416a2b1311bf643320b16dfefc7c7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_0.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..62a9d77f0784460b686a4ce4a4aeec0cf53a5dc9 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcc6a085ae9fa32e071955c100498473d71cb8b2f80ad49403cd46d82d13dbe +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8dae3b6367f98a19de674e7a2fac41dd2b5475a5 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d44c146ebe91b86fc70cd49010c73f4395b90548319ff05f673ef898dd705c03 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5d265f72bf2ace5d97ad6ffb742411eb9fd54cf4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47998f6709ee34ca49e4b3b9e844605975a188c3b78a1c011b27e7bc8a945a04 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3a79f726d9f8f510fc4b268ffe6a7e4a5a1bfdbc --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:401f179f0404055285f35e32d5ce55db8cbe98a8ab34157c791893b7055dfbd8 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7559f997ba50db7b5860b511e07a390a2e9beecf --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:174f0e66f6f286e060ff7a2ca5b3b5649e0d80a5e4fdd13904706fc053470662 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a05e4a5defbf895e95f703409ef0779157558288 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:171a9406d94df154be166e6e355d9b1a68c4d4ab9eeb4a1dd2ef5cdee33238fe +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9f1ca0a319b97b27b345a313b5b9befec6626021 --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:888aa14779dc88903725e3520f16b9d8e26440de130625c6837973ff711ee156 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8c5ea2e3769a9c1b192907a70260b46afa3ccc87 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2210aaeeb05ac38ece28b4510da3d692524faedf4a4077f0dbb5096fab560b0b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0dfc5042623ef7726b506de13b2ea72b3a3ed53d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf6cad752099c4071014a94b104810e87d4e23f0fb2af82943b89c1e53358da +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..22d203d6ff7a82810ade34a758416299003b3e79 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08bcc4aa4321444238b75c63550b581810188334e03e6f1e49c4c057578a9c28 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..73a9cb8829c9027d91cf163cfb366255bdb336de --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae1ef0f2d60eec07610fee71202468d972f2ecefb6fcc14126fc03fcd693afa7 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_3.pkl 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b97934ff4a5e37cc6f7004a9504e21ef446cad98 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb2f77c193b6970bf4ad1384d37adf46c32aa96e7f2256fc3f61375d62a6a4c6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..774fa4cb448e88b7016804e36f98de9caa946079 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6572ddc879c4dc351d4157c1b840984b7877312ad841c46cc87f4e17e94dc38b +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3a306df32cfd67a5df3fe08f814f88482c0982de --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c399ac194eb80090cf83de03fca5dd2d103f8ced4b0671e7b2531cb2f972cad +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..62ccb1d3a32ae18ea55d970cfff317d5cf924196 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f58ede6c7fc7ee8c6478e979f2e6774303bbb35bb7b8ba77abe6d01d7d775502 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a76e623c527bb98fd0525a88f5da632013b03790 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e63408ba3e0bfc5cff56ccb6762380b472bae6218265fb9d406c71e80b260f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, 
+ "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..48a67c20bf22cb3b224863b8500945592550a4d6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b36bf72f9b6b9f9dab158238f72696e23f07d17fc4fb9834d72b17899f6c08a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..871a41b3b54e1247f6265a795e92dfb9f6042f6a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e73d557c8896ad42016aacb3da7f2c38c6245ee0f52e2e28d7668309346fb93c +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e279d067e987468dbf195727ac5b82ace4c6e7d2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81c67d9cfb096fb5b23aefe55911ec8119877f7b01313dab88dc52616a8bd04f +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6ac56705aa33d9278bd46f80f2485367c0b97220 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95949a8c5afba25be63fe1636ef5d859b3a6ec64d61ecfba63888bae075d62a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8781afb623e45622f58514d124522dba94637d70 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f490369805e13468363543649e9d83ddb5f3299acb3656ecb9d69e185812aa6 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..764c9c882ba09583215927883f38a9e99001c0f0 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a1a1f1d112361fe98e45a27dc7eda0a9836ed11aa05cad551638078e2f821f +size 16513 diff --git 
a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4a7c1b030a794380cb6a13698c1b1a2c7c8ad3c1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1434890df5175142208c126db724b68bb850f0e43049410b15907455261c5245 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..40d6e9c507d29f3590a6b9d03636b4c52e284634 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa1e7ca4da87b834bb38cf837400142e4c353d9ceac3e988bf3e664770073859 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..96269f9753f5682ab51195a18d6b2eb9d99ea9e8 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85a6bb378c0276732fa5ebd2524f98a4d00534bbcd9e6196b8074d9b438d3bfc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b103f2b736181d824dd2d4f21a31a894abb2e48f --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ced6053cdc69623f9cb3287c0a9dbd0feca93f86db179831ef947266e645d7ee +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5666605d4123088f8af3ca5ee24eab355a49aa8f --- /dev/null +++ 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29bbe50271536aa51482cd9e3a23e7468781b6d2bc92465749ceabfd700c146e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..98b09322d858ab2efd45d802392f45dda3efdec2 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e19f724dd5e51083c4934b840d15593555d34c73d5644bb2f3ec21e5d777947a +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1e38654296eb040a6395b7ff1f2255ad62298d64 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bdb0efa7e146bdf2f915a64182a309be20a971462fdce1c1a2e2fe3d6bd854e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6abdbfc34de4c8f896bf012003f3fa56e64ae8ef --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d855d951db8a6a527dcb4945d1df9f63bf1fcb38a56208ec6da8f11e40816099 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..485f3fb32326badb0451f9afa18b8994350864b7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3c8891f4aaa16b49aecf6d3199bb15e71f92b3f8653f2a402cd00fd5a1292bc +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c90ded495a683eb8c0e9c13998070bc9687131e1 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a55e916f5d44701315ebe57cf9d56a954fa400f78354e4d98e67c38fe7dae45e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": 
"WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6deade0856ddc7dc91ef9c00fa4d927c80f17629 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57c44724a36a2e4a735c7b342c4a4e3c29bda50e8d30ebf6baad68f0efe1e44e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..93284f2ea5f41eb2249c2957652cd51766094e79 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381b46a6d60cb9c593e737e83baedc4d0297d347f8f3e0c58cf1b889b9728463 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..766d2bdcdbabb6125fbaf98d5f09a17b7bb91477 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2be6af61df8fc8028cb2ed4823b82c174013a31003ce03ed0e61f68c2cac46a0 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..09682ae3f828a2a800fc1f96f5c974c039462d35 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cd0c8944c7b4d1be784622a8e3adc33fbeebce155ccd5b9e994a3dd31c49619 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bd6869cfea04449294db9ea5b0699d90a54a962a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703972533c4b613f4bdc6ce351928dda59ee2cec59fc34c6c0bd97f094ced574 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_5.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..5306b719b487f5bffb4d7fa1ec87a3b238bd738d --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d92b6ea89ff7e79c79e93c812e077b0393189a96f4d705324568648a46da93 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..697cab55b85665979c25c987059669be4bdffcaf --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6b6543258b68965e55c866f7d3712370fdf4675f55343ffce5577f4969aba0 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3fd4756bf117fdf460b6c5cb44d1eb3c390d29ec --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:926714b35f96c24e7996a0c06b619e1771ff6772669a016ad4cc9c3fe7bfe58d +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/transformer/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_0.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d0d8d3be31b7a6826ea869668b4458560feb3529 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bcaa4fd62284fa807d232298c6e7fd3da4ed23b470f2d1518b8b5931123ca75 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_1.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..037c08bebe42ce76946c29b5ca6a4d6f96150f8a --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51fbf75ace651874c605277597832ddfba8fbc8ec39c3a6a0a45ec51e8db1ea6 +size 16513 diff 
--git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_2.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cc16642a59334d2faa9202812669b3e7c701c0a6 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad49fd003f9d74646837365a150b96793af0cdd4476d1e054ddc950adb08d922 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_3.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0be9f7fc809f9d4260d770b5cbbb6c87d4d50a51 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3b91f7a3bc6255cda972edc6d0163047b7411a39db86a6f59f2737ec8479dce +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_4.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e67f808948da13ede839dda3d21a07850ed5aaa7 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d02173ca2cd40e09ccb72a6938a61ac72153a10dd5abd41b51fb820b91f6f5 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_5.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..089c92c828b5eb88693172f535a183736f996635 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cccfbb074a74ee8be54da1e4b026f2f8e0c35bc9d9ff55e81e45af26a22a1d9 +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_6.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55d125c7371e21ff1f3250794de5b18d4c0b6a21 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6871bdc415b7dd424b38bcf79704709c78e474d7ccd6c6e8d799e44d2a53f37e +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_7.pkl b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e68fbea2000ef14ed5b723375951c696147e08e4 --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a0c01a9b37c7957f4354a1547aba4dcdd62c1c3a32c17f20154b41f7c10a6c +size 16513 diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/transformer/config.json 
b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/config.json b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/config.json new file mode 100644 index 0000000000000000000000000000000000000000..30655b818888e490a10121f12c4a0fb7a347f92e --- /dev/null +++ b/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 64000, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 32, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 64001, + "width": 56, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_0.pkl b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4d127830da39c53b7fe7a5c558bbe0fddd94ec0a --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:044b351f8a336e13961869a2f930f5250bb899cbfa3b9ca6507c0e392cf0085a +size 16513 diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_1.pkl b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0ceb18b9e60d026b3c3007f123cba9456eb7f76f --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:203e43ebcb90f796b00eedf91597aafca899fbb35789a58c1df682e0189786ad +size 16513 diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_2.pkl b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f1237bbb471b2e2fd5113140ce0f980d7e990f66 --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da03fa260223732f7514165c10dc5f5e510657a5f4b41f8743e1d83b904b43ca +size 16513 diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_3.pkl b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a9c4a2fdc0f2065bee9630435bf7bc7064c0dcfa --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:994f569354daa1bae68aa8f65997da9e845c828f0a6c00a6ba6648aec22ba1d5 +size 16513 diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_4.pkl b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..28881de0f51cca9b3088e04adb3d070b7117db4d --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ddce88be3e127d77c985aba98c2356afffe27630a671f7d09002f1b8f5e2ec0 +size 16513 diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_5.pkl b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..947b9470684064207e8ecf8a19683e4ee62051c6 --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7f741e8d5145098a163bd7bb80c1472b4486432659e6e958527fa1c20f94a41 +size 16513 diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_6.pkl b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..148b47d75a377651a5c26e4f80a014cf29cc631f --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbcf970838701e93134dc3efbfc89d004ba2f6a54f927037952d5f01943f0d4d +size 16513 diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_7.pkl b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f0b2161d01cefb22e2396a5fda18aaf1b9595a64 --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bf3e546871048e5bbcc383b71619ec2ad04b1064c84b7df74635fb8e13efc2c +size 16513 diff --git a/Meissonic/output_256x448_9f_4bs/checkpoint-500/transformer/config.json b/Meissonic/output_256x448_9f_4bs/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d85e49ec2870ffbd20158c74ca9a0541f45e99c7 --- /dev/null +++ b/Meissonic/output_256x448_9f_4bs/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 16, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 65537, + "width": 28, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_0.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..baa5cdf9a19e4a156a54fcf2b3e7ac49d84c2841 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3104d8848ef7319cd6640abe22ae30f93b4f29040e6f91c2c938d1f273da2c6 +size 16513 diff --git 
a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_1.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..85f54ed31e891dc6ad6faf5b94413cfde47ab0ee --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615955d37db4aeba41e81931394b975b6dd40c3f23aaf90ee8d60c9db0d95d2f +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_2.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bd36eb91aad6842c02da03a944328e6540d3f906 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c033d983208718dc6026321b60c46e9e4fef1b82eecbdfea5f3ec78341101cf2 +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_3.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cd4777669b9de7f0971d55c993fb33afbec086c2 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae2f6b08d4f2f452ad5791f429229ebe0ebc55c0b963b364bdc0701e9ec1218c +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_4.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4aa529c75d706d71d6b08109671a9ecc39b03e0a --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6be1c87d130ed64e28696153a25af41bbe5ec349980c7a2c6b1b854ce58dfdda +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_5.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..76c795ff2177e243c897ea789f0d3227860ae7cf --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33f382837c695a2f9b74e0bb078452162c43d399e2128adb6cf4bc7f715efabb +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_6.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2dd4d101b923bc59a5a7135a9cbda4617967de27 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0acbff4259b4a08538bd8d177461b361f94e7fab0857e80b6e2cca7f2246bb +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_7.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b827583d8ff5c386342e36ff6bd36d1bfc3cf7f5 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_7.pkl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:fb31b5334b7649f85bc32cb371dc38ef015fd8560780f8c7fec9d0e7186805d2 +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/transformer/config.json b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6d107d3cdb8e4d23e8d5bf4ab9b518d083941228 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_0.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4d127830da39c53b7fe7a5c558bbe0fddd94ec0a --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:044b351f8a336e13961869a2f930f5250bb899cbfa3b9ca6507c0e392cf0085a +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_1.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0ceb18b9e60d026b3c3007f123cba9456eb7f76f --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:203e43ebcb90f796b00eedf91597aafca899fbb35789a58c1df682e0189786ad +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_2.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f1237bbb471b2e2fd5113140ce0f980d7e990f66 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da03fa260223732f7514165c10dc5f5e510657a5f4b41f8743e1d83b904b43ca +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_3.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a9c4a2fdc0f2065bee9630435bf7bc7064c0dcfa --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:994f569354daa1bae68aa8f65997da9e845c828f0a6c00a6ba6648aec22ba1d5 +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_4.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..28881de0f51cca9b3088e04adb3d070b7117db4d --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:6ddce88be3e127d77c985aba98c2356afffe27630a671f7d09002f1b8f5e2ec0 +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_5.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..947b9470684064207e8ecf8a19683e4ee62051c6 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7f741e8d5145098a163bd7bb80c1472b4486432659e6e958527fa1c20f94a41 +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_6.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..148b47d75a377651a5c26e4f80a014cf29cc631f --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbcf970838701e93134dc3efbfc89d004ba2f6a54f927037952d5f01943f0d4d +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_7.pkl b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f0b2161d01cefb22e2396a5fda18aaf1b9595a64 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bf3e546871048e5bbcc383b71619ec2ad04b1064c84b7df74635fb8e13efc2c +size 16513 diff --git a/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/transformer/config.json b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/transformer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6d107d3cdb8e4d23e8d5bf4ab9b518d083941228 --- /dev/null +++ b/Meissonic/output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/transformer/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 2, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 4096, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_overfit/checkpoint-1000/config.json b/Meissonic/output_overfit/checkpoint-1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3492dbcee7ac5e7ffe15db97178ed63066e1e542 --- /dev/null +++ b/Meissonic/output_overfit/checkpoint-1000/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 1, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_overfit/checkpoint-1500/config.json b/Meissonic/output_overfit/checkpoint-1500/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..3492dbcee7ac5e7ffe15db97178ed63066e1e542 --- /dev/null +++ b/Meissonic/output_overfit/checkpoint-1500/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 1, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_overfit/checkpoint-2000/config.json b/Meissonic/output_overfit/checkpoint-2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3492dbcee7ac5e7ffe15db97178ed63066e1e542 --- /dev/null +++ b/Meissonic/output_overfit/checkpoint-2000/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 1, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_overfit/checkpoint-2500/config.json b/Meissonic/output_overfit/checkpoint-2500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3492dbcee7ac5e7ffe15db97178ed63066e1e542 --- /dev/null +++ b/Meissonic/output_overfit/checkpoint-2500/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 1, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_overfit/checkpoint-3000/config.json b/Meissonic/output_overfit/checkpoint-3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3492dbcee7ac5e7ffe15db97178ed63066e1e542 --- /dev/null +++ b/Meissonic/output_overfit/checkpoint-3000/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", + "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 1, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/output_overfit/checkpoint-500/config.json b/Meissonic/output_overfit/checkpoint-500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3492dbcee7ac5e7ffe15db97178ed63066e1e542 --- /dev/null +++ b/Meissonic/output_overfit/checkpoint-500/config.json @@ -0,0 +1,31 @@ +{ + "_class_name": "WanDiscreteVideoTransformer", + "_diffusers_version": "0.35.2", 
+ "codebook_size": 65536, + "cross_attn_norm": true, + "dim": 1536, + "eps": 1e-06, + "ffn_dim": 8960, + "freq_dim": 256, + "height": 4, + "in_dim": 16, + "model_type": "t2v", + "num_frames": 1, + "num_heads": 12, + "num_layers": 30, + "out_dim": 16, + "patch_size": [ + 1, + 2, + 2 + ], + "qk_norm": true, + "text_dim": 768, + "text_len": 512, + "vocab_size": 65537, + "width": 7, + "window_size": [ + -1, + -1 + ] +} diff --git a/Meissonic/predict.py b/Meissonic/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..0d24702bbaa8ba3fd16f4588d377808e83f5e240 --- /dev/null +++ b/Meissonic/predict.py @@ -0,0 +1,105 @@ +# Prediction interface for Cog ⚙️ +# https://cog.run/python + +import os +import subprocess +import time +import torch +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +from diffusers import VQModel +from cog import BasePredictor, Input, Path + +from src.transformer import Transformer2DModel +from src.pipeline import Pipeline +from src.scheduler import Scheduler + + +MODEL_CACHE = "model_cache" +MODEL_URL = ( + f"https://weights.replicate.delivery/default/viiika/Meissonic/{MODEL_CACHE}.tar" +) + +os.environ.update( + { + "HF_DATASETS_OFFLINE": "1", + "TRANSFORMERS_OFFLINE": "1", + "HF_HOME": MODEL_CACHE, + "TORCH_HOME": MODEL_CACHE, + "HF_DATASETS_CACHE": MODEL_CACHE, + "TRANSFORMERS_CACHE": MODEL_CACHE, + "HUGGINGFACE_HUB_CACHE": MODEL_CACHE, + } +) + + +def download_weights(url, dest): + start = time.time() + print("downloading url: ", url) + print("downloading to: ", dest) + subprocess.check_call(["pget", "-x", url, dest], close_fds=False) + print("downloading took: ", time.time() - start) + + +class Predictor(BasePredictor): + def setup(self) -> None: + """Load the model into memory to make running multiple predictions efficient""" + + if not os.path.exists(MODEL_CACHE): + download_weights(MODEL_URL, MODEL_CACHE) + + model_path = f"{MODEL_CACHE}/MeissonFlow/Meissonic" + model = Transformer2DModel.from_pretrained(model_path, subfolder="transformer") + vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae") + text_encoder = CLIPTextModelWithProjection.from_pretrained( # more stable sampling for some cases + f"{MODEL_CACHE}/laion/CLIP-ViT-H-14-laion2B-s32B-b79K" + ) + tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer") + scheduler = Scheduler.from_pretrained(model_path, subfolder="scheduler") + self.pipe = Pipeline( + vq_model, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=model, + scheduler=scheduler, + ).to("cuda") + + def predict( + self, + prompt: str = Input( + description="Input prompt", + default="a photo of an astronaut riding a horse on mars", + ), + negative_prompt: str = Input( + description="Specify things to not see in the output", + default="worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark", + ), + num_inference_steps: int = Input( + description="Number of denoising steps", ge=1, le=100, default=64 + ), + guidance_scale: float = Input( + description="Scale for classifier-free guidance", ge=0, le=20, default=9 + ), + seed: int = Input( + description="Random seed. 
Leave blank to randomize the seed", default=None + ), + ) -> Path: + """Run a single prediction on the model""" + if seed is None: + seed = int.from_bytes(os.urandom(2), "big") + print(f"Using seed: {seed}") + torch.manual_seed(seed) + + image = self.pipe( + prompt=prompt, + negative_prompt=negative_prompt, + height=1024, + width=1024, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + ).images[0] + output_path = f"/tmp/out.png" + image.save(output_path) + return Path(output_path) diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/.gitattributes b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..14a69d3c1c50c140b42093ed67c383e51f98c237 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +autoencoder.jit filter=lfs diff=lfs merge=lfs -text +decoder.jit filter=lfs diff=lfs merge=lfs -text +encoder.jit filter=lfs diff=lfs merge=lfs -text diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/README.md b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d7338fbf966ac341b466f5d1d10c0e62a67421b6 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/README.md @@ -0,0 +1,326 @@ +--- +license: other +license_name: nvidia-open-model-license +license_link: >- + https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf +library_name: nemo +--- +# **Cosmos Tokenizer**: A suite of image and video tokenizers + +[**Website**](https://research.nvidia.com/labs/dir/cosmos-tokenizer) | [**Code**](https://github.com/NVIDIA/Cosmos-Tokenizer) | [**Video**](https://youtu.be/Soy_myOfWIU) + + +# Model Overview + +## Description: +**Cosmos Tokenizer** is a suite of visual tokenizers for images and videos that delivers various compression rates 
while maintaining high reconstruction quality. Cosmos Tokenizer can serve as an effective and efficient building block in both diffusion-based and autoregressive models for image and video generation. + + +Our tokenizers come in two types: **Continuous** (C) and **Discrete** (D), each with **Image** (I) and **Video** (V) variants: +* Continuous tokenizers encode visual data into continuous latent embeddings, as shown in latent diffusion models like [Stable Diffusion](https://github.com/CompVis/stable-diffusion). These embeddings are suitable for models that generate data by sampling from continuous distributions. +* Discrete tokenizers encode visual data into discrete latent codes, mapping them into quantized indices, as seen in autoregressive transformers such as [VideoPoet](https://sites.research.google/videopoet/). This discretization is required for models that generate data by optimizing the cross-entropy loss, such as the GPT models. + + +| | Continuous ( C ) | Discrete ( D ) | +| ------------------|---------------------|---------------------| +| **Images ( I )** | Cosmos-Tokenizer-CI | Cosmos-Tokenizer-DI | +| **Videos ( V )** | Cosmos-Tokenizer-CV | Cosmos-Tokenizer-DV | + + +Given an image or a video, Cosmos Tokenizer outputs either continuous latents or discrete tokens. Cosmos Tokenizer achieves spatial compression rates of 8x8 or 16x16 and temporal compression factors of 4x or 8x, resulting in a total compression factor of up to 2048x (=8x16x16). Cosmos Tokenizer delivers 8x more total compression than state-of-the-art (SOTA) methods while simultaneously maintaining higher image quality and running up to 12x faster than the best available SOTA tokenizers. + +**Model Developer**: NVIDIA + +## Model Versions + +The initial release (v1.0) of Cosmos Tokenizer includes the following tokenizers: +* **Continuous Tokenizers** + * Continuous Image (CI) Tokenizer + * [Cosmos-Tokenizer-CI8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CI8x8) (8x8 spatial compression) + * [Cosmos-Tokenizer-CI16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-CI16x16) (16x16 spatial compression) + * Continuous Video (CV) Tokenizer + * [Cosmos-Tokenizer-CV4x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV4x8x8) (4x temporal compression, 8x8 spatial compression) + * [Cosmos-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x8x8) (8x temporal compression, 8x8 spatial compression) + * [Cosmos-Tokenizer-CV8x16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x16x16) (8x temporal compression, 16x16 spatial compression) +* **Discrete Tokenizers** + * Discrete Image (DI) Tokenizer + * [Cosmos-Tokenizer-DI8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DI8x8) (8x8 spatial compression) + * [Cosmos-Tokenizer-DI16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-DI16x16) (16x16 spatial compression) + * Discrete Video (DV) Tokenizer + * [Cosmos-Tokenizer-DV4x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV4x8x8) (4x temporal compression, 8x8 spatial compression) + * [Cosmos-Tokenizer-DV8x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV8x8x8) (8x temporal compression, 8x8 spatial compression) + * [Cosmos-Tokenizer-DV8x16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV8x16x16) (8x temporal compression, 16x16 spatial compression) + + +### License/Terms of Use: +[NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf) + +Under the NVIDIA Open Model License, NVIDIA confirms: + +* Models are 
commercially usable. +* You are free to create and distribute Derivative Models. +* NVIDIA does not claim ownership to any outputs generated using the Models or Derivative Models. + +## Model Architecture: + +We designed Cosmos Tokenizer using a lightweight and computationally efficient architecture, featuring a temporally causal design. Specifically, we employ causal temporal convolution and causal temporal attention layers to preserve the natural temporal order of video frames, ensuring seamless tokenization of images and videos using a single unified network architecture. The encoder and decoder form a symmetrical pair, which are mirrors of each other. The encoder starts with a 2-level [Haar wavelet](https://link.springer.com/book/10.1007/978-3-319-04295-4) transform layer, which down-samples inputs by a factor of 4 in both spatial and temporal dimensions. Likewise, the decoder ends with an inverse wavelet transform. We employ the vanilla autoencoder (AE) formulation to model the latent space for continuous tokenizers. For discrete tokenizers, we adopt the [Finite-Scalar-Quantization](https://openreview.net/forum?id=8ishA3LxN8) (FSQ) as the latent space quantizer. + +![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/638fb8cf2380ffd99caf8c2a/gQH5n9iCEtqZc7uutUwdL.jpeg) + + + +## Input/Output Specifications + +### Encoder +* **Input** + * **Types:** Images or Videos + * **Format:** RGB (Red, Green, Blue) + * **Resolution:** + * Minimum: 256px (shorter side) + * Maximum: Up to 4K + * **Video Length:** Up to 8 seconds for 1080p videos (bounded by A100 80G GPU memory; higher resolutions will have shorter supported durations) + +* **Output** + * **Types:** Tokens + * Continuous Image/Video Tokenizers: Continuous value feature vectors + * Discrete Image/Video Tokenizers: Integer indices + +### Decoder +* **Input** + * **Types:** Tokens from encoder + +* **Output** + * **Types:** Images or Videos (matching input type) + * **Format:** RGB (Red, Green, Blue) + * **Resolution:** Same as input resolution + * **Video Length:** Same as input video length + +## Software Integration (Required For NVIDIA Models Only): +**Runtime Engine(s):** +* [Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer) +* [NeMo](https://github.com/NVIDIA/NeMo) (please install the latest version from the GitHub main branch) + +**Supported Hardware Microarchitecture Compatibility:** +* NVIDIA Ampere (e.g., A100) +* NVIDIA Hopper (e.g., H100) + +Note: We have only tested Cosmos Tokenizer with BF16 precision on Ampere and Hopper GPUs. If you are using older versions of NVIDIA GPUs (e.g., NVIDIA Volta GPUs), you may need to switch to FP32 precision. + + +**Operating System(s):** +* Linux (We have not tested on other operating systems.) + +# Usage +Inference Engines: +* [Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer) (PyTorch) +* [NeMo](https://github.com/NVIDIA/NeMo) + +## Inference with `Cosmos-Tokenizer` (PyTorch) +### Step-1: Installation of `Cosmos-Tokenizer` +Note: Currently, the `Cosmos-Tokenizer` code is only supported on Linux. + +- Please clone the `Cosmos-Tokenizer` from GitHub repo [github.com/NVIDIA/Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer). + + ```bash + git clone https://github.com/NVIDIA/Cosmos-Tokenizer.git + cd Cosmos-Tokenizer + ``` +- Install dependencies + + ```bash + pip3 install -r requirements.txt + apt-get install -y ffmpeg + ``` + +- Preferably, you could build a docker image using our provided Dockerfile. 
+ ```bash + docker build -t cosmos-docker -f Dockerfile . + # You can run the container as: + docker run --gpus all -it --rm -v /home/${USER}:/home/${USER} \ + --workdir ${PWD} cosmos-docker /bin/bash + ``` + +### Step-2: Download Pre-trained Checkpoints +- Create a local directory for the pre-trained checkpoints and download the +pre-trained checkpoints from HuggingFace. + + ```python + from huggingface_hub import login, snapshot_download + import os + # You could get your Hugging Face token from https://huggingface.co/settings/tokens + login(token="<your-hf-token>", add_to_git_credential=True) + # You could specify the tokenizers you want to download. + model_names = [ + "Cosmos-Tokenizer-CI8x8", + "Cosmos-Tokenizer-CI16x16", + "Cosmos-Tokenizer-CV4x8x8", + "Cosmos-Tokenizer-CV8x8x8", + "Cosmos-Tokenizer-CV8x16x16", + "Cosmos-Tokenizer-DI8x8", + "Cosmos-Tokenizer-DI16x16", + "Cosmos-Tokenizer-DV4x8x8", + "Cosmos-Tokenizer-DV8x8x8", + "Cosmos-Tokenizer-DV8x16x16", + ] + for model_name in model_names: + hf_repo = "nvidia/" + model_name + local_dir = "pretrained_ckpts/" + model_name + os.makedirs(local_dir, exist_ok=True) + print(f"downloading {model_name} to {local_dir}...") + snapshot_download(repo_id=hf_repo, local_dir=local_dir) + ``` + +- Under each checkpoint directory in `pretrained_ckpts/`, we provide the encoder, +decoder and the full autoencoder JIT models. + + ```bash + ├── pretrained_ckpts/ + │ ├── Cosmos-Tokenizer-DV8x8x8/ + │ │ ├── encoder.jit + │ │ ├── decoder.jit + │ │ ├── autoencoder.jit + │ ... + ``` + +### Step-3: Run Inference +You can use the following example commands to encode and decode images or videos. For each, the same command works for both continuous and discrete tokenization. Simply provide the proper JIT-compiled ckpt to `checkpoint_enc`, `checkpoint_dec`, or the full autoencoder ckpt to `checkpoint`. + +```python +import torch +from cosmos_tokenizer.video_lib import CausalVideoTokenizer +model_name = "Cosmos-Tokenizer-DV4x8x8" +input_tensor = torch.randn(1, 3, 9, 512, 512).to('cuda').to(torch.bfloat16) +encoder = CausalVideoTokenizer(checkpoint_enc=f'pretrained_ckpts/{model_name}/encoder.jit') +(indices, codes) = encoder.encode(input_tensor) +torch.testing.assert_close(indices.shape, (1, 3, 64, 64)) +torch.testing.assert_close(codes.shape, (1, 6, 3, 64, 64)) + +# The input tensor can be reconstructed by the decoder as: +decoder = CausalVideoTokenizer(checkpoint_dec=f'pretrained_ckpts/{model_name}/decoder.jit') +reconstructed_tensor = decoder.decode(indices) +torch.testing.assert_close(reconstructed_tensor.shape, input_tensor.shape) +``` + +The `indices` will have the shape `(1, 3, 64, 64)` and contain integral values in the range `[1..64K]`, where the first of the three integral maps represents the first frame. +The `codes` will contain the pre-quantization continuous latent with shape `(1, 6, 3, 64, 64)`, where C=6 represents the number of FSQ levels. + +**Note**: More inference usage commands, including both TorchScript (JIT) and PyTorch Inference APIs on real images and videos, can be found on our GitHub repository [github.com/NVIDIA/Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer). + + +## Inference with NeMo + +### Step-1: Install NeMo +Please install NeMo from the GitHub `main` branch following the instructions [here](https://github.com/NVIDIA/NeMo?tab=readme-ov-file#pip-from-a-source-branch).
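The linked NeMo instructions are the authoritative reference; as a rough sketch, a source install of the `main` branch usually reduces to a single pip VCS install (assumption: the base package suffices for the video tokenizer collection; the exact extras you need may differ):

```bash
# Hedged sketch: install NeMo directly from the main branch on GitHub.
# Assumes the base nemo_toolkit package is enough for the video tokenizers;
# consult the linked NeMo README if additional extras are required.
pip install "git+https://github.com/NVIDIA/NeMo.git@main"
```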
+ +### Step-2: Run Inference +Run the following code to tokenize the video: + +```python +import torch +from nemo.collections.common.video_tokenizers.cosmos_vision_tokenizer import CausalVideoTokenizer +model_name = "Cosmos-Tokenizer-DV4x8x8" +model = CausalVideoTokenizer.from_pretrained(model_name) +input_tensor = torch.randn(1, 3, 9, 512, 512).to('cuda').to(torch.bfloat16) +(indices, codes) = model.encode(input_tensor) +``` +Please see the [Cosmos Tokenizer README within the NeMo repository](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/common/video_tokenizers) for additional examples to create training datasets with the Cosmos Tokenizer. + + +# Evaluation + +## TokenizationPerformance Comparison +We have extensively evaluated the **Cosmos Tokenizer** suite on various image and video benchmark datasets. In addition to commonly used datasets such as [MS-COCO](https://cocodataset.org/#home) and [DAVIS](https://davischallenge.org/), in order to cover a wide variety of visual data and standardize the evaluation, we created a benchmark called [TokenBench](https://github.com/NVlabs/Token-Bench), which is a mixed sampling of video data from diverse domains. + +| Tokenizer | Compression Ratio | Quantization | PSNR (DAVIS) | SSIM (DAVIS) | rFVD (DAVIS) | PSNR (TokenBench) | SSIM (TokenBench) | rFVD (TokenBench) | +|-----------|------------------|--------------|--------------|--------------|--------------|------------------|------------------|------------------| +| VideoGPT | 4×4×4 | VQ | 32.23 | **0.850** | 72.33 | 35.11 | **0.914** | **13.85** | +| Omnitokenizer | 4×8×8 | VQ | 28.44 | 0.712 | 188.60 | 30.15 | 0.827 | 53.55 | +| Cosmos-Tokenizer-DV | 4×8×8 | FSQ | **32.98** | 0.818 | **37.36** | **35.13** | 0.887 | 19.67 | +| Cosmos-Tokenizer-DV | 8×8×8 | FSQ | 32.11 | 0.775 | 100.15 | 34.74 | 0.872 | 43.86 | +| Cosmos-Tokenizer-DV | 8×16×16 | FSQ | 31.42 | 0.716 | 241.52 | 33.71 | 0.828 | 113.48 | + +* We compare with the state-of-the-art discrete video tokenizer, [OmniTokenizer](https://github.com/FoundationVision/OmniTokenizer). +* Evaluation metrics: + * Peak Signal-to-Noise Ratio (PSNR) + * Structural Similarity (SSIM) + * Reconstruction Fréchet Video Distance (rFVD) + +## Runtime Comparison + +The following table shows the number of parameters and the averaged encoding and decoding times per image or video frame, measured on a single A100 80GB GPU. For comparison, we also list the parameters and average speeds of prior state-of-the-art tokenizer(s) with the same compression ratio. + +| Tokenizer | Resolution | Compression Ratio | Parameters | Time (ms) | +|----------------|------------|-------------------|------------|-----------| +| OmniTokenizer | 720x1280 | 4×8×8 | 54M | 53.2 | +| Cosmos-DV | 720x1280 | 4×8×8 | 105M | 51.5 | + +Note: We benchmarked the runtime for images under the 8x8 compression and videos under the 4×8×8 compression. Tokenizers with different compression ratios are not included in this comparison. + +## Ethical Considerations +NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. 
+ +For more detailed information on ethical considerations for this model, please see the subcards of Explainability, Bias, Safety & Security, and Privacy below. Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/). + +### Bias + +Field | Response +:---------------------------------------------------------------------------------------------------|:--------------- +Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing: | None +Measures taken to mitigate against unwanted bias: | None + + +### Explainability + +Field | Response +:------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------- +Intended Application & Domain: | Tokenization of images and videos +Model Type: | Auto-Encoder +Intended Users: | Generative AI developers for image and video generation models +Output: | Images/Videos and Latent Tokens +Describe how the model works: | Compresses and decompresses visual input (image/video). +Technical Limitations: | Due to tokenizer compression limitations, some visual information (such as small text and other structured fine details) may not be reconstructed accurately. +Verified to have met prescribed NVIDIA quality standards: | Yes +Performance Metrics: | Peak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), Reconstruction Fréchet Video Distance (rFVD), Reconstruction Fréchet Inception Distance (rFID), Latency +Potential Known Risks: | Tokenizer's output can parse all forms of input, including what may be considered toxic, offensive, or indecent. +Licensing: | [NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf) + + +### Privacy +Field | Response +:----------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------- +Generatable or reverse engineerable personal information? | No +Protected class data used to create this model? | None Known +Was consent obtained for any personal data used? | None Known +How often is dataset reviewed? | Before Release +Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable +If personal collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable +If personal collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable +If personal collected for the development of this AI model, was it minimized to only what was required? | Not Applicable +Is there provenance for all datasets used in training? | Yes +Does data labeling (annotation, metadata) comply with privacy laws? | Yes +Is data compliant with data subject requests for data correction or removal, if such a request was made? | Not Applicable + +### Safety + +Field | Response +:---------------------------------------------------|:---------------------------------- +Model Application(s): | Tokenization of images and videos +Describe the life critical impact (if present). 
| None Known +Use Case Restrictions: | See [NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf) +Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. Model checkpoints are made available on Hugging Face, and may become available on cloud providers' model catalog. + + +### Plus Plus (++) Promise + +We value you, the datasets, the diversity they represent, and what we have been entrusted with. This model and its associated data have been: +* Verified to comply with current applicable disclosure laws, regulations, and industry standards. +* Verified to comply with applicable privacy labeling requirements. +* Annotated to describe the collector/source (NVIDIA or a third-party). +* Characterized for technical limitations. +* Reviewed to ensure proper disclosure is accessible to, maintained for, and in compliance with NVIDIA data subjects and their requests. +* Reviewed before release. +* Tagged for known restrictions and potential safety implications. + + +# Core Contributors +Fitsum Reda, Jinwei Gu, Xian Liu, Songwei Ge, Ting-Chun Wang, Haoxiang Wang, Ming-Yu Liu \ No newline at end of file diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/autoencoder.jit b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/autoencoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..79301739232c4699b33ff1489a7c8efd636b34f8 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/autoencoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea104b84fee21d170fb20f99027c076ffd97e37b8d43a6a8f6135a2a61cfaf1 +size 211093069 diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/config.json b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bcad561de5279b772db7dd4b76b11d07ddc7ced1 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/config.json @@ -0,0 +1,6 @@ +{ + "architectures": [ + "CosmosTokenizer" + ], +} + \ No newline at end of file diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/decoder.jit b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/decoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..1183024939ae16ac34b3fc29c0d3b61223aa7ced --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/decoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b82dd6f4d489bbeb728e54c828d5a676f17e6eba9b9dfe2dc7839928bee73f +size 125210440 diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/encoder.jit b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/encoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..0992f10de6fafd4e4659bc8277cb958236045a0f --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/encoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a0e8459ab5e0ecfd0c00f215571de43e368f090c16adeb1a69fa835177bdea6 +size 86641076 diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/model_config.yaml b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/model_config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..5be0900a5551d62cb295248f76186f2f665c51d0 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/model_config.yaml @@ -0,0 +1 @@ +nemo_version: https://github.com/NVIDIA/NeMo/commit/6a5d4b5d19e05262a4182a83613753d424153a8f \ No newline at end of file diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/.gitattributes b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..14a69d3c1c50c140b42093ed67c383e51f98c237 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +autoencoder.jit filter=lfs diff=lfs merge=lfs -text +decoder.jit filter=lfs diff=lfs merge=lfs -text +encoder.jit filter=lfs diff=lfs merge=lfs -text diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/README.md b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c12a96ecf56e164f03b8f41e6b1208849d393c3f --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/README.md @@ -0,0 +1,325 @@ +--- +license: other +license_name: nvidia-open-model-license +license_link: >- + https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf +library_name: nemo +--- +# **Cosmos Tokenizer**: A suite of image and video tokenizers + +[**Website**](https://research.nvidia.com/labs/dir/cosmos-tokenizer) | [**Code**](https://github.com/NVIDIA/Cosmos-Tokenizer) | **Video** + + +# Model Overview + +## Description: +**Cosmos Tokenizer** is a suite of visual tokenizers for images and videos that delivers various compression rates while maintaining high reconstruction quality. Cosmos Tokenizer can serve as an effective and efficient building block in both diffusion-based and autoregressive models for image and video generation. 
+ + +Our tokenizers come in two types: **Continuous** (C) and **Discrete** (D), each with **Image** (I) and **Video** (V) variants: +* Continuous tokenizers encode visual data into continuous latent embeddings, as shown in latent diffusion models like [Stable Diffusion](https://github.com/CompVis/stable-diffusion). These embeddings are suitable for models that generate data by sampling from continuous distributions. +* Discrete tokenizers encode visual data into discrete latent codes, mapping them into quantized indices, as seen in autoregressive transformers such as [VideoPoet](https://sites.research.google/videopoet/). This discretization is required for models that generate data by optimizing the cross-entropy loss, such as the GPT models. + + +| | Continuous ( C ) | Discrete ( D ) | +| ------------------|---------------------|---------------------| +| **Images ( I )** | Cosmos-Tokenizer-CI | Cosmos-Tokenizer-DI | +| **Videos ( V )** | Cosmos-Tokenizer-CV | Cosmos-Tokenizer-DV | + + +Given an image or a video, Cosmos Tokenizer outputs either continuous latents or discrete tokens. Cosmos Tokenizer achieves spatial compression rates of 8x8 or 16x16 and temporal compression factors of 4x or 8x, resulting in a total compression factor of up to 2048x (=8x16x16). Cosmos Tokenizer delivers 8x more total compression than state-of-the-art (SOTA) methods while simultaneously maintaining higher image quality and running up to 12x faster than the best available SOTA tokenizers. + +**Model Developer**: NVIDIA + +## Model Versions + +The initial release (v1.0) of Cosmos Tokenizer includes the following tokenizers: +* **Continuous Tokenizers** + * Continuous Image (CI) Tokenizer + * [Cosmos-Tokenizer-CI8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CI8x8) (8x8 spatial compression) + * [Cosmos-Tokenizer-CI16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-CI16x16) (16x16 spatial compression) + * Continuous Video (CV) Tokenizer + * [Cosmos-Tokenizer-CV4x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV4x8x8) (4x temporal compression, 8x8 spatial compression) + * [Cosmos-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x8x8) (8x temporal compression, 8x8 spatial compression) + * [Cosmos-Tokenizer-CV8x16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x16x16) (8x temporal compression, 16x16 spatial compression) +* **Discrete Tokenizers** + * Discrete Image (DI) Tokenizer + * [Cosmos-Tokenizer-DI8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DI8x8) (8x8 spatial compression) + * [Cosmos-Tokenizer-DI16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-DI16x16) (16x16 spatial compression) + * Discrete Video (DV) Tokenizer + * [Cosmos-Tokenizer-DV4x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV4x8x8) (4x temporal compression, 8x8 spatial compression) + * [Cosmos-Tokenizer-DV8x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV8x8x8) (8x temporal compression, 8x8 spatial compression) + * [Cosmos-Tokenizer-DV8x16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV8x16x16) (8x temporal compression, 16x16 spatial compression) + + +### License/Terms of Use: +[NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf) + +Under the NVIDIA Open Model License, NVIDIA confirms: + +* Models are commercially usable. +* You are free to create and distribute Derivative Models. +* NVIDIA does not claim ownership to any outputs generated using the Models or Derivative Models. 
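To make the compression rates quoted above concrete, the short sketch below (illustrative only, not part of the released code) computes the discrete token-grid size from the temporal compression `ct` and spatial compression `cs`. The formula mirrors the example shapes shown in the inference sections of this card: the causal design keeps the first frame, so `T` input frames map to `(T - 1)/ct + 1` token maps.

```python
# Illustrative sketch: latent token-grid arithmetic for a ct x cs x cs discrete
# video tokenizer (consistent with the example shapes in the inference code below).
def token_grid(num_frames: int, height: int, width: int, ct: int, cs: int):
    """Return the (T', H', W') token grid for a ct x cs x cs video tokenizer."""
    assert (num_frames - 1) % ct == 0, "frame count is assumed to be 1 + k*ct"
    assert height % cs == 0 and width % cs == 0, "spatial dims assumed divisible by cs"
    return ((num_frames - 1) // ct + 1, height // cs, width // cs)

print(token_grid(9, 512, 512, ct=4, cs=8))   # (3, 64, 64) -> matches the DV4x8x8 example below
print(token_grid(9, 512, 512, ct=8, cs=16))  # (2, 32, 32) -> the DV8x16x16 variant with the same input
```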
+ +## Model Architecture: + +We designed Cosmos Tokenizer using a lightweight and computationally efficient architecture, featuring a temporally causal design. Specifically, we employ causal temporal convolution and causal temporal attention layers to preserve the natural temporal order of video frames, ensuring seamless tokenization of images and videos using a single unified network architecture. The encoder and decoder form a symmetrical pair, which are mirrors of each other. The encoder starts with a 2-level [Haar wavelet](https://link.springer.com/book/10.1007/978-3-319-04295-4) transform layer, which down-samples inputs by a factor of 4 in both spatial and temporal dimensions. Likewise, the decoder ends with an inverse wavelet transform. We employ the vanilla autoencoder (AE) formulation to model the latent space for continuous tokenizers. For discrete tokenizers, we adopt the [Finite-Scalar-Quantization](https://openreview.net/forum?id=8ishA3LxN8) (FSQ) as the latent space quantizer. + +![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/638fb8cf2380ffd99caf8c2a/gQH5n9iCEtqZc7uutUwdL.jpeg) + + + +## Input/Output Specifications + +### Encoder +* **Input** + * **Types:** Images or Videos + * **Format:** RGB (Red, Green, Blue) + * **Resolution:** + * Minimum: 256px (shorter side) + * Maximum: Up to 4K + * **Video Length:** Up to 8 seconds for 1080p videos (bounded by A100 80G GPU memory; higher resolutions will have shorter supported durations) + +* **Output** + * **Types:** Tokens + * Continuous Image/Video Tokenizers: Continuous value feature vectors + * Discrete Image/Video Tokenizers: Integer indices + +### Decoder +* **Input** + * **Types:** Tokens from encoder + +* **Output** + * **Types:** Images or Videos (matching input type) + * **Format:** RGB (Red, Green, Blue) + * **Resolution:** Same as input resolution + * **Video Length:** Same as input video length + +## Software Integration (Required For NVIDIA Models Only): +**Runtime Engine(s):** +* [Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer) +* [NeMo](https://github.com/NVIDIA/NeMo) (please install the latest version from the GitHub main branch) + +**Supported Hardware Microarchitecture Compatibility:** +* NVIDIA Ampere (e.g., A100) +* NVIDIA Hopper (e.g., H100) + +Note: We have only tested Cosmos Tokenizer with BF16 precision on Ampere and Hopper GPUs. If you are using older versions of NVIDIA GPUs (e.g., NVIDIA Volta GPUs), you may need to switch to FP32 precision. + + +**Operating System(s):** +* Linux (We have not tested on other operating systems.) + +# Usage +Inference Engines: +* [Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer) (PyTorch) +* [NeMo](https://github.com/NVIDIA/NeMo) + +## Inference with `Cosmos-Tokenizer` (PyTorch) +### Step-1: Installation of `Cosmos-Tokenizer` +Note: Currently, the `Cosmos-Tokenizer` code is only supported on Linux. + +- Please clone the `Cosmos-Tokenizer` from GitHub repo [github.com/NVIDIA/Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer). + + ```bash + git clone https://github.com/NVIDIA/Cosmos-Tokenizer.git + cd Cosmos-Tokenizer + ``` +- Install dependencies + + ```bash + pip3 install -r requirements.txt + apt-get install -y ffmpeg + ``` + +- Preferably, you could build a docker image using our provided Dockerfile. + ```bash + docker build -t cosmos-docker -f Dockerfile. 
+ # You can run the container as: + docker run --gpus all -it --rm -v /home/${USER}:/home/${USER} \ + --workdir ${PWD} cosmos-docker /bin/bash + ``` + +### Step-2: Download Pre-trained Checkpoints +- Create a local directory for the pre-trained checkpoints and download the +pre-trained checkpoints from HuggingFace. + + ```python + from huggingface_hub import login, snapshot_download + import os + # You could get your Hugging Face token from https://huggingface.co/settings/tokens + login(token=, add_to_git_credential=True) + # You could specify the tokenizers you want to download. + model_names = [ + "Cosmos-Tokenizer-CI8x8", + "Cosmos-Tokenizer-CI16x16", + "Cosmos-Tokenizer-CV4x8x8", + "Cosmos-Tokenizer-CV8x8x8", + "Cosmos-Tokenizer-CV8x16x16", + "Cosmos-Tokenizer-DI8x8", + "Cosmos-Tokenizer-DI16x16", + "Cosmos-Tokenizer-DV4x8x8", + "Cosmos-Tokenizer-DV8x8x8", + "Cosmos-Tokenizer-DV8x16x16", + ] + for model_name in model_names: + hf_repo = "nvidia/" + model_name + local_dir = "pretrained_ckpts/" + model_name + os.makedirs(local_dir, exist_ok=True) + print(f"downloading {model_name} to {local_dir}...") + snapshot_download(repo_id=hf_repo, local_dir=local_dir) + ``` + +- Under the ech checkpoint directory `pretrained_ckpts/`, we provide the encoder, +decoder and the full autoencoder JIT models. + + ```bash + ├── pretrained_ckpts/ + │ ├── Cosmos-Tokenizer-DV8x8x8/ + │ │ ├── encoder.jit + │ │ ├── decoder.jit + │ │ ├── autoencoder.jit + │ ... + ``` + +### Step-3: Run Inference +You can use the following example commands to encode and decode images or videos. For each, the same command works for both continuous and discrete tokenization. Simply provide the proper JIT-compiled ckpt to `checkpoint_enc`, `checkpoint_dec`, or the full autoencoder ckpt to `checkpoint`. + +```python +import torch +from cosmos_tokenizer.video_lib import CausalVideoTokenizer +model_name = "Cosmos-Tokenizer-DV4x8x8" +input_tensor = torch.randn(1, 3, 9, 512, 512).to('cuda').to(torch.bfloat16) +encoder = CausalVideoTokenizer(checkpoint_enc=f'pretrained_ckpts/{model_name}/encoder.jit') +(indices, codes) = encoder.encode(input_tensor) +torch.testing.assert_close(indices.shape, (1, 3, 64, 64)) +torch.testing.assert_close(codes.shape, (1, 6, 3, 64, 64)) + +# The input tensor can be reconstructed by the decoder as: +decoder = CausalVideoTokenizer(checkpoint_dec=f'pretrained_ckpts/{model_name}/decoder.jit') +reconstructed_tensor = decoder.decode(indices) +torch.testing.assert_close(reconstructed_tensor.shape, input_tensor.shape) +``` + +The `indices` will have the shape `(1, 3, 64, 64)` and contain integral values in the range `[1..64K]`, where the first of the three integral maps represents the first frame. +The `codes` will contain the pre-quantization continuous latent with shape `(1, 6, 3, 64, 64)`, where C=6 represents the number of FSQ levels. + +**Note**: More inference usage commands, including both TorchScript (JIT) and PyTorch Inference APIs on real images and videos, can be found on our GitHub repository [github.com/NVIDIA/Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer). + + +## Inference with NeMo + +### Step-1: Install NeMo +Please install NeMo from the GitHub `main` branch following the instructions [here](https://github.com/NVIDIA/NeMo?tab=readme-ov-file#pip-from-a-source-branch). 
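Before moving on to Step-2, it can help to confirm that the source install succeeded and that the GPU setup matches what the example expects. The snippet below is a minimal sanity check, not an official utility; it only assumes the import path used in the Step-2 example and a CUDA-capable GPU, since that example places tensors on `cuda` and uses `bfloat16`.

```python
# Minimal sanity check (assumes NeMo was installed from the GitHub main branch as above).
import torch
from nemo.collections.common.video_tokenizers.cosmos_vision_tokenizer import CausalVideoTokenizer

assert torch.cuda.is_available(), "The Step-2 example places tensors on 'cuda'."
print("bfloat16 supported on this GPU:", torch.cuda.is_bf16_supported())
print("Tokenizer class importable:", CausalVideoTokenizer.__name__)
```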
+ +### Step-2: Run Inference +Run the following code to tokenize the video: + +```python +import torch +from nemo.collections.common.video_tokenizers.cosmos_vision_tokenizer import CausalVideoTokenizer +model_name = "Cosmos-Tokenizer-DV4x8x8" +model = CausalVideoTokenizer.from_pretrained(model_name) +input_tensor = torch.randn(1, 3, 9, 512, 512).to('cuda').to(torch.bfloat16) +(indices, codes) = model.encode(input_tensor) +``` +Please see the [Cosmos Tokenizer README within the NeMo repository](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/common/video_tokenizers) for additional examples to create training datasets with the Cosmos Tokenizer. + +# Evaluation + +## TokenizationPerformance Comparison +We have extensively evaluated the **Cosmos Tokenizer** suite on various image and video benchmark datasets. In addition to commonly used datasets such as [MS-COCO](https://cocodataset.org/#home) and [DAVIS](https://davischallenge.org/), in order to cover a wide variety of visual data and standardize the evaluation, we created a benchmark called [TokenBench](https://github.com/NVlabs/Token-Bench), which is a mixed sampling of video data from diverse domains. + +| Tokenizer | Compression Ratio | Quantization | PSNR (DAVIS) | SSIM (DAVIS) | rFVD (DAVIS) | PSNR (TokenBench) | SSIM (TokenBench) | rFVD (TokenBench) | +|-----------|------------------|--------------|--------------|--------------|--------------|------------------|------------------|------------------| +| VideoGPT | 4×4×4 | VQ | 32.23 | **0.850** | 72.33 | 35.11 | **0.914** | **13.85** | +| Omnitokenizer | 4×8×8 | VQ | 28.44 | 0.712 | 188.60 | 30.15 | 0.827 | 53.55 | +| Cosmos-Tokenizer-DV | 4×8×8 | FSQ | **32.98** | 0.818 | **37.36** | **35.13** | 0.887 | 19.67 | +| Cosmos-Tokenizer-DV | 8×8×8 | FSQ | 32.11 | 0.775 | 100.15 | 34.74 | 0.872 | 43.86 | +| Cosmos-Tokenizer-DV | 8×16×16 | FSQ | 31.42 | 0.716 | 241.52 | 33.71 | 0.828 | 113.48 | + +* We compare with the state-of-the-art discrete video tokenizer, [OmniTokenizer](https://github.com/FoundationVision/OmniTokenizer). +* Evaluation metrics: + * Peak Signal-to-Noise Ratio (PSNR) + * Structural Similarity (SSIM) + * Reconstruction Fréchet Video Distance (rFVD) + +## Runtime Comparison + +The following table shows the number of parameters and the averaged encoding and decoding times per image or video frame, measured on a single A100 80GB GPU. For comparison, we also list the parameters and average speeds of prior state-of-the-art tokenizer(s) with the same compression ratio. + +| Tokenizer | Resolution | Compression Ratio | Parameters | Time (ms) | +|----------------|------------|-------------------|------------|-----------| +| OmniTokenizer | 720x1280 | 4×8×8 | 54M | 53.2 | +| Cosmos-DV | 720x1280 | 4×8×8 | 105M | 51.5 | + +Note: We benchmarked the runtime for images under the 8x8 compression and videos under the 4×8×8 compression. Tokenizers with different compression ratios are not included in this comparison. + +## Ethical Considerations +NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. 
+ +For more detailed information on ethical considerations for this model, please see the subcards of Explainability, Bias, Safety & Security, and Privacy below. Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/). + +### Bias + +Field | Response +:---------------------------------------------------------------------------------------------------|:--------------- +Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing: | None +Measures taken to mitigate against unwanted bias: | None + + +### Explainability + +Field | Response +:------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------- +Intended Application & Domain: | Tokenization of images and videos +Model Type: | Auto-Encoder +Intended Users: | Generative AI developers for image and video generation models +Output: | Images/Videos and Latent Tokens +Describe how the model works: | Compresses and decompresses visual input (image/video). +Technical Limitations: | Due to tokenizer compression limitations, some visual information (such as small text and other structured fine details) may not be reconstructed accurately. +Verified to have met prescribed NVIDIA quality standards: | Yes +Performance Metrics: | Peak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), Reconstruction Fréchet Video Distance (rFVD), Reconstruction Fréchet Inception Distance (rFID), Latency +Potential Known Risks: | Tokenizer's output can parse all forms of input, including what may be considered toxic, offensive, or indecent. +Licensing: | [NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf) + + +### Privacy +Field | Response +:----------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------- +Generatable or reverse engineerable personal information? | No +Protected class data used to create this model? | None Known +Was consent obtained for any personal data used? | None Known +How often is dataset reviewed? | Before Release +Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable +If personal collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable +If personal collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable +If personal collected for the development of this AI model, was it minimized to only what was required? | Not Applicable +Is there provenance for all datasets used in training? | Yes +Does data labeling (annotation, metadata) comply with privacy laws? | Yes +Is data compliant with data subject requests for data correction or removal, if such a request was made? | Not Applicable + +### Safety + +Field | Response +:---------------------------------------------------|:---------------------------------- +Model Application(s): | Tokenization of images and videos +Describe the life critical impact (if present). 
| None Known +Use Case Restrictions: | See [NVIDIA Open Model License](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf) +Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. Model checkpoints are made available on Hugging Face, and may become available on cloud providers' model catalog. + + +### Plus Plus (++) Promise + +We value you, the datasets, the diversity they represent, and what we have been entrusted with. This model and its associated data have been: +* Verified to comply with current applicable disclosure laws, regulations, and industry standards. +* Verified to comply with applicable privacy labeling requirements. +* Annotated to describe the collector/source (NVIDIA or a third-party). +* Characterized for technical limitations. +* Reviewed to ensure proper disclosure is accessible to, maintained for, and in compliance with NVIDIA data subjects and their requests. +* Reviewed before release. +* Tagged for known restrictions and potential safety implications. + + +# Core Contributors +Fitsum Reda, Jinwei Gu, Xian Liu, Songwei Ge, Ting-Chun Wang, Haoxiang Wang, Ming-Yu Liu \ No newline at end of file diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/autoencoder.jit b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/autoencoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..6cd404c2c06d7396433fd28cd35d608ffd261cad --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/autoencoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf00856bc49c9e1c5ca6b01f47ef65d35bcdb37d58724641a3ea45751714724 +size 213071541 diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/config.json b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bcad561de5279b772db7dd4b76b11d07ddc7ced1 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/config.json @@ -0,0 +1,6 @@ +{ + "architectures": [ + "CosmosTokenizer" + ], +} + \ No newline at end of file diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/decoder.jit b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/decoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..6afa147171abff1b954f902dd760b01b1cbfee70 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/decoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:881f1f6317872fad3eeeaa1e595061aa3ee12590d14ce435ac9e9e5c883e797b +size 126792092 diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/encoder.jit b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/encoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..3758c66d1c873532097db33174c5c6e40130864f --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/encoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a014af9f0bfae97a29a5bf071ca58ac29be40d4ffae12ef08a11004b92f0fb8d +size 87042184 diff --git a/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/model_config.yaml b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/model_config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..5be0900a5551d62cb295248f76186f2f665c51d0 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/model_config.yaml @@ -0,0 +1 @@ +nemo_version: https://github.com/NVIDIA/NeMo/commit/6a5d4b5d19e05262a4182a83613753d424153a8f \ No newline at end of file diff --git a/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/.gitattributes b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..14a69d3c1c50c140b42093ed67c383e51f98c237 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +autoencoder.jit filter=lfs diff=lfs merge=lfs -text +decoder.jit filter=lfs diff=lfs merge=lfs -text +encoder.jit filter=lfs diff=lfs merge=lfs -text diff --git a/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/README.md b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dfebdabc2dfa12b157600490ef6e0264abcfd380 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/README.md @@ -0,0 +1,396 @@ +--- +license: other +license_name: nvidia-open-model-license +license_link: >- + https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license +library_name: cosmos +tags: +- nvidia +- nemo +- cosmos +extra_gated_prompt: >- + # NVIDIA Open Model License Agreement + + Version Release Date: January 6, 2025 + + This NVIDIA Open Model License Agreement (the "Agreement") is a legal agreement between the Legal Entity You represent, or if no entity is identified, You and NVIDIA Corporation and its Affiliates ("NVIDIA") and governs Your use of the Models that NVIDIA provides to You under this Agreement. NVIDIA and You are each a "party" and collectively the "parties." 
+ + NVIDIA models released under this Agreement are intended to be used permissively and enable the further development of AI technologies. Subject to the terms of this Agreement, NVIDIA confirms that: + + * Models are commercially usable. + + * You are free to create and distribute Derivative Models. + + * NVIDIA does not claim ownership to any outputs generated using the Models or Model Derivatives. + + By using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model or Derivative Model, or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement. + + ## 1. Definitions + + The following definitions apply to this Agreement: + + 1.1. "NVIDIA Cosmos Model" means a multimodal Model shared under this Agreement. + + 1.2. "Derivative Model" means all (a) modifications to the Model, (b) works based on the Model, and (c) any other derivative works of the Model. An output is not a Derivative Model. + + 1.3. "Legal Entity" means the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (a) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (b) ownership of fifty percent (50%) or more of the outstanding shares, or (c) beneficial ownership of such entity. + + 1.4. "Model" means the machine learning model, software, checkpoints, learnt weights, algorithms, parameters, configuration files and documentation shared under this Agreement. + + 1.5. "You" or "Your" means an individual or Legal Entity exercising permissions granted by this Agreement. + + ## 2. Conditions for Use, License Grant, AI Ethics and IP Ownership + + 2.1. Conditions for Use. The Model and any Derivative Model are subject to additional terms as described in Section 2 and Section 3 of this Agreement and govern Your use. If You institute copyright or patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model or a Derivative Model constitutes direct or contributory copyright or patent infringement, then any licenses granted to You under this Agreement for that Model or Derivative Model will terminate as of the date such litigation is filed. If You bypass, disable, reduce the efficacy of, or circumvent any technical limitation, safety guardrail or associated safety guardrail hyperparameter, encryption, security, digital rights management, or authentication mechanism contained in the Model, your rights under this Agreement will automatically terminate. NVIDIA may update this Agreement to comply with legal and regulatory requirements at any time and You agree to either comply with any updated license or cease Your copying, use, and distribution of the Model and any Derivative Model. + + 2.2. License Grant. The rights granted herein are explicitly conditioned on Your full compliance with the terms of this Agreement. Subject to the terms and conditions of this Agreement, NVIDIA hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, revocable (as stated in Section 2.1) license to publicly perform, publicly display, reproduce, use, create derivative works of, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution) and import the Model. + + 2.3. AI Ethics. 
Use of the Models under the Agreement must be consistent with NVIDIA's Trustworthy AI terms found at https://www.nvidia.com/en-us/agreements/trustworthy-ai/terms/. + + 2.4. NVIDIA owns the Model and any Model Derivatives created by NVIDIA. Subject to NVIDIA's underlying ownership rights in the Model or its Model Derivatives, You are and will be the owner of Your Model Derivatives. NVIDIA claims no ownership rights in outputs. You are responsible for outputs and their subsequent uses. Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests and remedies in connection with the Model and (b) no other license or right is granted to you by implication, estoppel or otherwise. + + ## 3. Redistribution + + You may reproduce and distribute copies of the Model or Derivative Models thereof in any medium, with or without modifications, provided that You meet the following conditions: + + 3.1. If you distribute the Model, You must give any other recipients of the Model a copy of this Agreement and include the following attribution notice within a "Notice" text file with such copies: "Licensed by NVIDIA Corporation under the NVIDIA Open Model License"; + + 3.2. If you distribute or make available a NVIDIA Cosmos Model, or a product or service (including an AI model) that contains or uses a NVIDIA Cosmos Model, use a NVIDIA Cosmos Model to create a Derivative Model, or use a NVIDIA Cosmos Model or its outputs to create, train, fine tune, or otherwise improve an AI model, you will include "Built on NVIDIA Cosmos" on a related website, user interface, blogpost, about page, or product documentation; and + + 3.3. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Models as a whole, provided Your use, reproduction, and distribution of the Model otherwise complies with the conditions stated in this Agreement. + + ## 4. Trademarks + + This Agreement does not grant permission to use the trade names, trademarks, service marks, or product names of NVIDIA, except as required for reasonable and customary use in describing the origin of the Model and reproducing the content of the "Notice" text file. + + ## **5. Disclaimer of Warranty** + + **Unless required by applicable law or agreed to in writing, NVIDIA provides the Model on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivative Models and outputs and assume any risks associated with Your exercise of permissions under this Agreement.** + + ## **6. 
Limitation of Liability** + + **In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, will NVIDIA be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this Agreement or out of the use or inability to use the Model, Derivative Models or outputs (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if NVIDIA has been advised of the possibility of such damages.** + + ## 7. Indemnity + + You will indemnify and hold harmless NVIDIA from and against any claim by any third party arising out of or related to your use or distribution of the Model, Model Derivatives or outputs. + + ## 8. Feedback + + NVIDIA appreciates your feedback, and You agree that NVIDIA may use it without restriction or compensation to You. + + ## 9. Governing Law + + This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; except that, either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + + ## 10. Trade and Compliance + + You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, as amended, including without limitation U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. These laws include restrictions on destinations, end-users and end-use. +extra_gated_fields: + By clicking Submit below, I accept the terms of the NVIDIA Open Model License Agreement and acknowledge that I am an adult of legal age of majority in the country in which the Cosmos Models will be used and have authority to accept this Agreement: checkbox +extra_gated_description: >- + The information you provide will be collected, stored, processed and shared in accordance with the [NVIDIA Privacy Policy](https://www.nvidia.com/en-us/about-nvidia/privacy-policy/). +extra_gated_button_content: Submit +--- +# **Cosmos Tokenizer**: A suite of image and video tokenizers + +[**Website**](https://research.nvidia.com/labs/dir/cosmos-tokenizer) | [**GitHub**](https://github.com/NVIDIA/Cosmos-Tokenizer) | [**NVIDIA News**](https://blogs.nvidia.com/blog/robot-learning-humanoid-development/) | [**NVIDIA Blog**](https://developer.nvidia.com/blog/state-of-the-art-multimodal-generative-ai-model-development-with-nvidia-nemo/) | [**Hugging Face**](https://huggingface.co/collections/nvidia/cosmos-tokenizer-672b93023add81b66a8ff8e6) | [**YouTube**](https://youtu.be/Soy_myOfWIU) | [**Paper**](https://arxiv.org/abs/2501.03575) + +# Model Overview + +## Description: +**Cosmos Tokenizer** is a suite of visual tokenizers for images and videos that delivers various compression rates while maintaining high reconstruction quality. 
Cosmos Tokenizer can serve as an effective and efficient building block in both diffusion-based and autoregressive models for image and video generation. This model is ready for commercial use. + + +Our tokenizers come in two types: **Continuous** (C) and **Discrete** (D), each with **Image** (I) and **Video** (V) variants: +* Continuous tokenizers encode visual data into continuous latent embeddings, as shown in latent diffusion models like [Stable Diffusion](https://github.com/CompVis/stable-diffusion). These embeddings are suitable for models that generate data by sampling from continuous distributions. +* Discrete tokenizers encode visual data into discrete latent codes, mapping them into quantized indices, as seen in autoregressive transformers such as [VideoPoet](https://sites.research.google/videopoet/). This discretization is required for models that generate data by optimizing the cross-entropy loss, such as the GPT models. + + +| | Continuous ( C ) | Discrete ( D ) | +| ------------------|---------------------|---------------------| +| **Images ( I )** | Cosmos-Tokenizer-CI | Cosmos-Tokenizer-DI | +| **Videos ( V )** | Cosmos-Tokenizer-CV | Cosmos-Tokenizer-DV | + + +Given an image or a video, Cosmos Tokenizer outputs either continuous latents or discrete tokens. Cosmos Tokenizer achieves spatial compression rates of 8x8 or 16x16 and temporal compression factors of 4x or 8x, resulting in a total compression factor of up to 2048x (=8x16x16). Cosmos Tokenizer delivers 8x more total compression than state-of-the-art (SOTA) methods while simultaneously maintaining higher image quality and running up to 12x faster than the best available SOTA tokenizers. + +**Model Developer**: NVIDIA + +## Model Versions + +This release (v1.0) of Cosmos Tokenizer includes the following tokenizers: +* **Continuous Tokenizers** + * [Cosmos-1.0-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-1.0-Tokenizer-CV8x8x8) (8x temporal compression, 8x8 spatial compression, 720 short spatial size, 121 frames context) +* **Discrete Tokenizers** + * [Cosmos-1.0-Tokenizer-DV8x16x16](https://huggingface.co/nvidia/Cosmos-1.0-Tokenizer-DV8x16x16) (8x temporal compression, 16x16 spatial compression, 720 short spatial size, 49 frames context) + +The previous release (v0.1) of Cosmos Tokenizer included the following tokenizers: +* **Continuous Tokenizers** + * Continuous Image (CI) Tokenizer + * [Cosmos-0.1-Tokenizer-CI8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CI8x8) (8x8 spatial compression) + * [Cosmos-0.1-Tokenizer-CI16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-CI16x16) (16x16 spatial compression) + * Continuous Video (CV) Tokenizer + * [Cosmos-0.1-Tokenizer-CV4x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV4x8x8) (4x temporal compression, 8x8 spatial compression) + * [Cosmos-0.1-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x8x8) (8x temporal compression, 8x8 spatial compression) + * [Cosmos-0.1-Tokenizer-CV8x16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x16x16) (8x temporal compression, 16x16 spatial compression) +* **Discrete Tokenizers** + * Discrete Image (DI) Tokenizer + * [Cosmos-0.1-Tokenizer-DI8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DI8x8) (8x8 spatial compression) + * [Cosmos-0.1-Tokenizer-DI16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-DI16x16) (16x16 spatial compression) + * Discrete Video (DV) Tokenizer + * [Cosmos-0.1-Tokenizer-DV4x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV4x8x8) (4x temporal compression, 
8x8 spatial compression) + * [Cosmos-1.0-Tokenizer-DV8x8x8](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV8x8x8) (8x temporal compression, 8x8 spatial compression) + * [Cosmos-1.0-Tokenizer-DV8x16x16](https://huggingface.co/nvidia/Cosmos-Tokenizer-DV8x16x16) (8x temporal compression, 16x16 spatial compression) + + + +### License: +This model is released under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). For a custom license, please contact [cosmos-license@nvidia.com](mailto:cosmos-license@nvidia.com). + +Under the NVIDIA Open Model License, NVIDIA confirms: + +* Models are commercially usable. +* You are free to create and distribute Derivative Models. +* NVIDIA does not claim ownership to any outputs generated using the Models or Derivative Models. + +**Important Note**: If you bypass, disable, reduce the efficacy of, or circumvent any technical limitation, safety guardrail or +associated safety guardrail hyperparameter, encryption, security, digital rights management, or authentication mechanism contained +in the Model, your rights under [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license) will automatically terminate. + +## Model Architecture: + +We designed Cosmos Tokenizer using a lightweight and computationally efficient architecture, featuring a temporally causal design. Specifically, we employ causal temporal convolution and causal temporal attention layers to preserve the natural temporal order of video frames, ensuring seamless tokenization of images and videos using a single unified network architecture. The encoder and decoder form a symmetrical pair, which are mirrors of each other. The encoder starts with a 2-level [Haar wavelet](https://link.springer.com/book/10.1007/978-3-319-04295-4) transform layer, which down-samples inputs by a factor of 4 in both spatial and temporal dimensions. Likewise, the decoder ends with an inverse wavelet transform. We employ the vanilla autoencoder (AE) formulation to model the latent space for continuous tokenizers. For discrete tokenizers, we adopt the [Finite-Scalar-Quantization](https://openreview.net/forum?id=8ishA3LxN8) (FSQ) as the latent space quantizer. + +![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/638fb8cf2380ffd99caf8c2a/gQH5n9iCEtqZc7uutUwdL.jpeg) + + + +## Input/Output Specifications + +### Encoder +* **Input** + * **Type:** Images or Videos + * **Format:** RGB (Red, Green, Blue) + * **Properties:** + * **Resolution:** Minimum: 256px (shorter side). 
Maximum: Up to 4K + * **Video Length:** Up to 8 seconds for 1080p videos (bounded by A100 80G GPU memory; higher resolutions will have shorter supported durations) + +* **Output** + * **Type:** Tokens + * **Properties:** + * Integer indices ranging from 0 to 63,999 + +### Decoder +* **Input** + * **Type:** Tokens + * **Properties:** + * Integer indices ranging from 0 to 63,999 + +* **Output** + * **Type** Images or Videos (matching input type) + * **Format:** RGB (Red, Green, Blue) + * **Properties:** + * **Resolution:** Same as input resolution + * **Video Length:** Same as input video length + +## Software Integration (Required For NVIDIA Models Only): +**Runtime Engine(s):** +* [Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer) + +**Supported Hardware Microarchitecture Compatibility:** +* NVIDIA Ampere (e.g., A100) +* NVIDIA Hopper (e.g., H100) + +Note: We have only tested Cosmos Tokenizer with BF16 precision on Ampere and Hopper GPUs. If you are using older versions of NVIDIA GPUs (e.g., NVIDIA Volta GPUs), you may need to switch to FP32 precision. + + +**Operating System(s):** +* Linux (We have not tested on other operating systems.) + +# Usage +Inference Engines: +* [Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer) (PyTorch) + +## Inference with `Cosmos-Tokenizer` (PyTorch) +### Step-1: Installation of `Cosmos-Tokenizer` +Note: Currently, the `Cosmos-Tokenizer` code is only supported on Linux. + +- Please clone the `Cosmos-Tokenizer` from GitHub repo [github.com/NVIDIA/Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer). + + ```bash + git clone https://github.com/NVIDIA/Cosmos-Tokenizer.git + cd Cosmos-Tokenizer + ``` +- Install dependencies + + ```bash + pip3 install -r requirements.txt + apt-get install -y ffmpeg + ``` + +- Preferably, you could build a docker image using our provided Dockerfile. + ```bash + docker build -t cosmos-docker -f Dockerfile. + # You can run the container as: + docker run --gpus all -it --rm -v /home/${USER}:/home/${USER} \ + --workdir ${PWD} cosmos-docker /bin/bash + ``` + +### Step-2: Download Pre-trained Checkpoints +- Create a local directory for the pre-trained checkpoints and download the +pre-trained checkpoints from HuggingFace. + + ```python + from huggingface_hub import login, snapshot_download + import os + # You could get your Hugging Face token from https://huggingface.co/settings/tokens + login(token=, add_to_git_credential=True) + # You could specify the tokenizers you want to download. + model_names = [ + "Cosmos-1.0-Tokenizer-DV8x16x16", + ] + for model_name in model_names: + hf_repo = "nvidia/" + model_name + local_dir = "pretrained_ckpts/" + model_name + os.makedirs(local_dir, exist_ok=True) + print(f"downloading {model_name} to {local_dir}...") + snapshot_download(repo_id=hf_repo, local_dir=local_dir) + ``` + +- Under the ech checkpoint directory `pretrained_ckpts/`, we provide the encoder, +decoder and the full autoencoder JIT models. + + ```bash + ├── pretrained_ckpts/ + │ ├── Cosmos-1.0-Tokenizer-DV8x16x16/ + │ │ ├── encoder.jit + │ │ ├── decoder.jit + │ │ ├── autoencoder.jit + │ ... + ``` + +### Step-3: Run Inference +You can use the following example commands to encode and decode images or videos. For each, the same command works for both continuous and discrete tokenization. Simply provide the proper JIT-compiled ckpt to `checkpoint_enc`, `checkpoint_dec`, or the full autoencoder ckpt to `checkpoint`. 
+ +```python +import torch +from cosmos_tokenizer.video_lib import CausalVideoTokenizer +model_name = "Cosmos-1.0-Tokenizer-DV8x16x16" +input_tensor = torch.randn(1, 3, 9, 512, 512).to('cuda').to(torch.bfloat16) +encoder = CausalVideoTokenizer(checkpoint_enc=f'pretrained_ckpts/{model_name}/encoder.jit') +(indices, codes) = encoder.encode(input_tensor) +torch.testing.assert_close(indices.shape, (1, 2, 32, 32)) +torch.testing.assert_close(codes.shape, (1, 6, 2, 32, 32)) + +# The input tensor can be reconstructed by the decoder as: +decoder = CausalVideoTokenizer(checkpoint_dec=f'pretrained_ckpts/{model_name}/decoder.jit') +reconstructed_tensor = decoder.decode(indices) +torch.testing.assert_close(reconstructed_tensor.shape, input_tensor.shape) +``` + +The `indices` will have the shape `(1, 2, 32, 32)` and contain integral values in the range `[0..63,999]`, where the first of the two integral maps represents the first frame. +The `codes` will contain the pre-quantization continuous latent with shape `(1, 6, 2, 32, 32)`, where C=6 represents the number of FSQ levels. + +**Note**: More inference usage commands, including both TorchScript (JIT) and PyTorch Inference APIs on real images and videos, can be found on our GitHub repository [github.com/NVIDIA/Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer). + + +# Evaluation + +## Tokenization Performance Comparison +We have extensively evaluated the additional **Cosmos Tokenizer** models on the [DAVIS](https://davischallenge.org/) video benchmark dataset. + +| Tokenizer | Compression Ratio | Height | Num. of Frames | Quantization | PSNR (DAVIS) | SSIM (DAVIS) | rFVD (DAVIS) | +|-----------|------------------|--------------|--------------|--------------|--------------|--------------|--------------| +| VideoGPT | 4×4×4 | - | - | VQ | 32.23 | **0.850** | 72.33 | +| OmniTokenizer | 4×8×8 | - | - | VQ | 28.44 | 0.712 | 188.60 | +| Cosmos-Tokenizer-DV | 4×8×8 | 720 | 17 | FSQ | **32.98** | 0.818 | **37.36** | +| Cosmos-Tokenizer-DV | 8×8×8 | 720 | 17 | FSQ | 32.11 | 0.775 | 100.15 | +| Cosmos-Tokenizer-DV | 8×16×16 | 720 | 17 | FSQ | 31.42 | 0.716 | 241.52 | +| Cosmos-Tokenizer-DV | 8×16×16 | 720 | 49 | FSQ | 31.59 | 0.719 | 259.33 | + + +* We compare with the state-of-the-art discrete video tokenizer, [OmniTokenizer](https://github.com/FoundationVision/OmniTokenizer). +* Evaluation metrics: + * Peak Signal-to-Noise Ratio (PSNR) + * Structural Similarity (SSIM) + * Reconstruction Fréchet Video Distance (rFVD) + +## Runtime Comparison + +The following table shows the number of parameters and the averaged encoding and decoding times per image or video frame, measured on a single A100 80GB GPU. For comparison, we also list the parameters and average speeds of prior state-of-the-art tokenizer(s) with the same compression ratio. + +| Tokenizer | Resolution | Compression Ratio | Parameters | Time (ms) | +|----------------|------------|-------------------|------------|-----------| +| OmniTokenizer | 720x1280 | 4×8×8 | 54M | 53.2 | +| Cosmos-DV | 720x1280 | 4×8×8 | 105M | 51.5 | + +Note: We benchmarked the runtime for images under the 8x8 compression and videos under the 4×8×8 compression. Tokenizers with different compression ratios are not included in this comparison. + +## Ethical Considerations +NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications.
When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. + +### Plus Plus (++) Promise + +We value you, the datasets, the diversity they represent, and what we have been entrusted with. This model and its associated data have been: +* Verified to comply with current applicable disclosure laws, regulations, and industry standards. +* Verified to comply with applicable privacy labeling requirements. +* Annotated to describe the collector/source (NVIDIA or a third-party). +* Characterized for technical limitations. +* Reviewed to ensure proper disclosure is accessible to, maintained for, and in compliance with NVIDIA data subjects and their requests. +* Reviewed before release. +* Tagged for known restrictions and potential safety implications. + +For more detailed information on ethical considerations for this model, please see the subcards of Explainability, Bias, Safety & Security, and Privacy below. Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/). + +### Bias + +Field | Response +:---------------------------------------------------------------------------------------------------|:--------------- +Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing: | None +Measures taken to mitigate against unwanted bias: | None + + +### Explainability + +Field | Response +:------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------- +Intended Application & Domain: | Tokenization of images and videos +Model Type: | Auto-Encoder +Intended Users: | Generative AI developers for image and video generation models +Output: | Images/Videos and Latent Tokens +Describe how the model works: | Compresses and decompresses visual input (image/video). +Technical Limitations: | Due to tokenizer compression limitations, some visual information (such as small text and other structured fine details) may not be reconstructed accurately. The tokenizers may not produce as high of a reconstruction results for videos with low resolution, e.g. less than 320p. +Verified to have met prescribed NVIDIA quality standards: | Yes +Performance Metrics: | Peak Signal-to-Noise Ratio (PSNR), Structural Similarity (SSIM), Reconstruction Fréchet Video Distance (rFVD), Reconstruction Fréchet Inception Distance (rFID), Latency +Potential Known Risks: | Tokenizer's output can parse all forms of input, including what may be considered toxic, offensive, or indecent. +Licensing: | [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license) + + +### Privacy +Field | Response +:----------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------- +Generatable or reverse engineerable personal information? | No +Protected class data used to create this model? | None Known +Was consent obtained for any personal data used? | None Known +How often is dataset reviewed? | Before Release +Is a mechanism in place to honor data subject right of access or deletion of personal data? 
| Not Applicable +If personal collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable +If personal collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable +If personal collected for the development of this AI model, was it minimized to only what was required? | Not Applicable +Is there provenance for all datasets used in training? | Yes +Does data labeling (annotation, metadata) comply with privacy laws? | Yes +Is data compliant with data subject requests for data correction or removal, if such a request was made? | Not Applicable + +### Safety + +Field | Response +:---------------------------------------------------|:---------------------------------- +Model Application(s): | Tokenization of images and videos +Describe the life critical impact (if present). | None Known +Use Case Restrictions: | [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license) +Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. Model checkpoints are made available on Hugging Face, and may become available on cloud providers' model catalog. + + +# Core Contributors +Fitsum Reda, Jinwei Gu, Xian Liu, Songwei Ge, Ting-Chun Wang, Haoxiang Wang, Ming-Yu Liu \ No newline at end of file diff --git a/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/autoencoder.jit b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/autoencoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..7ca041131d5553bded0ca8122cb0821ea70ae0b0 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/autoencoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6871316ea4e6bd14a5f82c87c48ba0bcd853496830034a38701bc0ccd501c89 +size 223576773 diff --git a/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/config.json b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bcad561de5279b772db7dd4b76b11d07ddc7ced1 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/config.json @@ -0,0 +1,6 @@ +{ + "architectures": [ + "CosmosTokenizer" + ], +} + \ No newline at end of file diff --git a/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/decoder.jit b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/decoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..59eed113cb2aeeebba6f9c6603bddb812aa4cf4c --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/decoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40add3d4d7c00e0d4ee30fcc69c2171fa30cb045b70b7e5f979ad66419e8dcd9 +size 132042180 diff --git a/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/encoder.jit b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/encoder.jit new file mode 100644 index 0000000000000000000000000000000000000000..2cf3df91b8de7a5d1eeab660bdbe68cea1f7f1f6 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/encoder.jit @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51c760a14262cbc5db93c09bcb4710fa178e0e27928e4a75170f07db7225d4a8 +size 92292848 diff --git 
a/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/model_config.yaml b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/model_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5be0900a5551d62cb295248f76186f2f665c51d0 --- /dev/null +++ b/Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/model_config.yaml @@ -0,0 +1 @@ +nemo_version: https://github.com/NVIDIA/NeMo/commit/6a5d4b5d19e05262a4182a83613753d424153a8f \ No newline at end of file diff --git a/Meissonic/requirements.txt b/Meissonic/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..edb79a0c0c1a408d8363207385b74c11c8cf4b9f --- /dev/null +++ b/Meissonic/requirements.txt @@ -0,0 +1,22 @@ +--extra-index-url https://download.pytorch.org/whl/cu124 +accelerate +pytorch-lightning +torch +torchvision +tqdm +transformers +numpy +gradio +diffusers +bitsandbytes +open_clip_torch +datasets +peft +pillow +wandb +dask +pyarrow +huggingface_hub +peft +flash-attn --no-build-isolation +sentencepiece \ No newline at end of file diff --git a/Meissonic/src/__pycache__/pipeline.cpython-310.pyc b/Meissonic/src/__pycache__/pipeline.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0159a7b5f0d121615aae9c29abe1c8fc2dbee8c3 Binary files /dev/null and b/Meissonic/src/__pycache__/pipeline.cpython-310.pyc differ diff --git a/Meissonic/src/__pycache__/pipeline_video.cpython-310.pyc b/Meissonic/src/__pycache__/pipeline_video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4314f86aa21eeadb4e5afd02b642da5fe4784589 Binary files /dev/null and b/Meissonic/src/__pycache__/pipeline_video.cpython-310.pyc differ diff --git a/Meissonic/src/__pycache__/pipeline_video.cpython-313.pyc b/Meissonic/src/__pycache__/pipeline_video.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2db68c36948243e0c9555ee00478288e84cba927 Binary files /dev/null and b/Meissonic/src/__pycache__/pipeline_video.cpython-313.pyc differ diff --git a/Meissonic/src/__pycache__/pipeline_video.cpython-314.pyc b/Meissonic/src/__pycache__/pipeline_video.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6765ea05253811065bca9d8f951a181c500af22 Binary files /dev/null and b/Meissonic/src/__pycache__/pipeline_video.cpython-314.pyc differ diff --git a/Meissonic/src/__pycache__/scheduler.cpython-310.pyc b/Meissonic/src/__pycache__/scheduler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20bac6dbfaad2214f74b7135857ae40dbf76517e Binary files /dev/null and b/Meissonic/src/__pycache__/scheduler.cpython-310.pyc differ diff --git a/Meissonic/src/__pycache__/scheduler.cpython-313.pyc b/Meissonic/src/__pycache__/scheduler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31b67a17a1df5e3439c50e3970700d2ad89cff08 Binary files /dev/null and b/Meissonic/src/__pycache__/scheduler.cpython-313.pyc differ diff --git a/Meissonic/src/__pycache__/scheduler_video.cpython-310.pyc b/Meissonic/src/__pycache__/scheduler_video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22662d71b201f374f49c6deba45e91ea9df8e976 Binary files /dev/null and b/Meissonic/src/__pycache__/scheduler_video.cpython-310.pyc differ diff --git a/Meissonic/src/__pycache__/scheduler_video.cpython-313.pyc b/Meissonic/src/__pycache__/scheduler_video.cpython-313.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..20c3a2c918aed8bce69a58d8c707a75ea589362c Binary files /dev/null and b/Meissonic/src/__pycache__/scheduler_video.cpython-313.pyc differ diff --git a/Meissonic/src/__pycache__/scheduler_video.cpython-314.pyc b/Meissonic/src/__pycache__/scheduler_video.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c14ad2d213934d2a63348c5b51db51b15d46d716 Binary files /dev/null and b/Meissonic/src/__pycache__/scheduler_video.cpython-314.pyc differ diff --git a/Meissonic/src/__pycache__/transformer.cpython-310.pyc b/Meissonic/src/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34642b80cc313d964a27573c39c233114b05e99b Binary files /dev/null and b/Meissonic/src/__pycache__/transformer.cpython-310.pyc differ diff --git a/Meissonic/src/__pycache__/transformer.cpython-313.pyc b/Meissonic/src/__pycache__/transformer.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bb8b2a63399d12fdaf6df3b35da6494b7a6514d Binary files /dev/null and b/Meissonic/src/__pycache__/transformer.cpython-313.pyc differ diff --git a/Meissonic/src/__pycache__/transformer_video.cpython-310.pyc b/Meissonic/src/__pycache__/transformer_video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edf48f4feeebea45785b6f2cbde53855583348e6 Binary files /dev/null and b/Meissonic/src/__pycache__/transformer_video.cpython-310.pyc differ diff --git a/Meissonic/src/__pycache__/transformer_video.cpython-313.pyc b/Meissonic/src/__pycache__/transformer_video.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6773c1650cd9d3bbe17c43312a820d4101d50d1 Binary files /dev/null and b/Meissonic/src/__pycache__/transformer_video.cpython-313.pyc differ diff --git a/Meissonic/src/__pycache__/transformer_video.cpython-314.pyc b/Meissonic/src/__pycache__/transformer_video.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f16fdf1a752f73b689c814c0ec50e3e4b9e450ad Binary files /dev/null and b/Meissonic/src/__pycache__/transformer_video.cpython-314.pyc differ diff --git a/Meissonic/src/attention.py b/Meissonic/src/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..4dbbe03fc79e1eb1509dfd98720b60196144878d --- /dev/null +++ b/Meissonic/src/attention.py @@ -0,0 +1,179 @@ +# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. +import torch + +try: + import flash_attn_interface + FLASH_ATTN_3_AVAILABLE = True +except ModuleNotFoundError: + FLASH_ATTN_3_AVAILABLE = False + +try: + import flash_attn + FLASH_ATTN_2_AVAILABLE = True +except ModuleNotFoundError: + FLASH_ATTN_2_AVAILABLE = False + +import warnings + +__all__ = [ + 'flash_attention', + 'attention', +] + + +def flash_attention( + q, + k, + v, + q_lens=None, + k_lens=None, + dropout_p=0., + softmax_scale=None, + q_scale=None, + causal=False, + window_size=(-1, -1), + deterministic=False, + dtype=torch.bfloat16, + version=None, +): + """ + q: [B, Lq, Nq, C1]. + k: [B, Lk, Nk, C1]. + v: [B, Lk, Nk, C2]. Nq must be divisible by Nk. + q_lens: [B]. + k_lens: [B]. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + causal: bool. Whether to apply causal attention mask. + window_size: (left right). If not (-1, -1), apply sliding window local attention. + deterministic: bool. If True, slightly slower and uses more memory. + dtype: torch.dtype. 
Apply when dtype of q/k/v is not float16/bfloat16. + """ + half_dtypes = (torch.float16, torch.bfloat16) + assert dtype in half_dtypes + assert q.device.type == 'cuda' and q.size(-1) <= 256 + + # params + b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype + + def half(x): + return x if x.dtype in half_dtypes else x.to(dtype) + + # preprocess query + if q_lens is None: + q = half(q.flatten(0, 1)) + q_lens = torch.tensor( + [lq] * b, dtype=torch.int32).to( + device=q.device, non_blocking=True) + else: + q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)])) + + # preprocess key, value + if k_lens is None: + k = half(k.flatten(0, 1)) + v = half(v.flatten(0, 1)) + k_lens = torch.tensor( + [lk] * b, dtype=torch.int32).to( + device=k.device, non_blocking=True) + else: + k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)])) + v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)])) + + q = q.to(v.dtype) + k = k.to(v.dtype) + + if q_scale is not None: + q = q * q_scale + + if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE: + warnings.warn( + 'Flash attention 3 is not available, use flash attention 2 instead.' + ) + + # apply attention + if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE: + # Note: dropout_p, window_size are not supported in FA3 now. + x = flash_attn_interface.flash_attn_varlen_func( + q=q, + k=k, + v=v, + cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum( + 0, dtype=torch.int32).to(q.device, non_blocking=True), + cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum( + 0, dtype=torch.int32).to(q.device, non_blocking=True), + seqused_q=None, + seqused_k=None, + max_seqlen_q=lq, + max_seqlen_k=lk, + softmax_scale=softmax_scale, + causal=causal, + deterministic=deterministic)[0].unflatten(0, (b, lq)) + else: + assert FLASH_ATTN_2_AVAILABLE + x = flash_attn.flash_attn_varlen_func( + q=q, + k=k, + v=v, + cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum( + 0, dtype=torch.int32).to(q.device, non_blocking=True), + cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum( + 0, dtype=torch.int32).to(q.device, non_blocking=True), + max_seqlen_q=lq, + max_seqlen_k=lk, + dropout_p=dropout_p, + softmax_scale=softmax_scale, + causal=causal, + window_size=window_size, + deterministic=deterministic).unflatten(0, (b, lq)) + + # output + return x.type(out_dtype) + + +def attention( + q, + k, + v, + q_lens=None, + k_lens=None, + dropout_p=0., + softmax_scale=None, + q_scale=None, + causal=False, + window_size=(-1, -1), + deterministic=False, + dtype=torch.bfloat16, + fa_version=None, +): + if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE: + return flash_attention( + q=q, + k=k, + v=v, + q_lens=q_lens, + k_lens=k_lens, + dropout_p=dropout_p, + softmax_scale=softmax_scale, + q_scale=q_scale, + causal=causal, + window_size=window_size, + deterministic=deterministic, + dtype=dtype, + version=fa_version, + ) + else: + if q_lens is not None or k_lens is not None: + warnings.warn( + 'Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance.' 
+ ) + attn_mask = None + + q = q.transpose(1, 2).to(dtype) + k = k.transpose(1, 2).to(dtype) + v = v.transpose(1, 2).to(dtype) + + out = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p) + + out = out.transpose(1, 2).contiguous() + return out diff --git a/Meissonic/src/pipeline.py b/Meissonic/src/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..f82e41130368e7232d8d164e5560355993cc711f --- /dev/null +++ b/Meissonic/src/pipeline.py @@ -0,0 +1,370 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer +from diffusers.image_processor import VaeImageProcessor +from diffusers.models import VQModel +from diffusers.utils import replace_example_docstring +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput + +from src.scheduler import Scheduler +from src.transformer import Transformer2DModel + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> image = pipe(prompt).images[0] + ``` +""" + + +def _prepare_latent_image_ids(batch_size, height, width, device, dtype): + latent_image_ids = torch.zeros(height // 2, width // 2, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + + latent_image_ids = latent_image_ids.reshape( + latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + + return latent_image_ids.to(device=device, dtype=dtype) + + +class Pipeline(DiffusionPipeline): + image_processor: VaeImageProcessor + vqvae: VQModel + tokenizer: CLIPTokenizer + text_encoder: CLIPTextModelWithProjection + transformer: Transformer2DModel + scheduler: Scheduler + # tokenizer_t5: T5Tokenizer + # text_encoder_t5: T5ForConditionalGeneration + + model_cpu_offload_seq = "text_encoder->transformer->vqvae" + + def __init__( + self, + vqvae: VQModel, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + transformer: Transformer2DModel, + scheduler: Scheduler, + # tokenizer_t5: T5Tokenizer, + # text_encoder_t5: T5ForConditionalGeneration, + ): + super().__init__() + + self.register_modules( + vqvae=vqvae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=scheduler, + # tokenizer_t5=tokenizer_t5, + # text_encoder_t5=text_encoder_t5, + ) + self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: 
Optional[Union[List[str], str]] = None, + height: Optional[int] = 1024, + width: Optional[int] = 1024, + num_inference_steps: int = 48, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.IntTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_encoder_hidden_states: Optional[torch.Tensor] = None, + output_type="pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + micro_conditioning_aesthetic_score: int = 6, + micro_conditioning_crop_coord: Tuple[int, int] = (0, 0), + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 16): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.IntTensor`, *optional*): + Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image + gneration. If not provided, the starting latents will be completely masked. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. A single vector from the + pooled and projected final hidden states. + encoder_hidden_states (`torch.Tensor`, *optional*): + Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + negative_encoder_hidden_states (`torch.Tensor`, *optional*): + Analogous to `encoder_hidden_states` for the positive prompt. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. + micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. + temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): + Configures the temperature scheduler on `self.scheduler` see `Scheduler#set_timesteps`. + + Examples: + + Returns: + [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated images. 
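+
+        Note (added for clarity, describing the sampling loop below): when `guidance_scale > 1`, the
+        latent batch is duplicated into an unconditional and a conditional half, and the transformer
+        logits are combined per step as `uncond_logits + guidance_scale * (cond_logits - uncond_logits)`
+        before being passed to the scheduler.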
+ """ + if (prompt_embeds is not None and encoder_hidden_states is None) or ( + prompt_embeds is None and encoder_hidden_states is not None + ): + raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither") + + if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or ( + negative_prompt_embeds is None and negative_encoder_hidden_states is not None + ): + raise ValueError( + "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither" + ) + + if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None): + raise ValueError("pass only one of `prompt` or `prompt_embeds`") + + if isinstance(prompt, str): + prompt = [prompt] + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + batch_size = batch_size * num_images_per_prompt + + if height is None: + height = self.transformer.config.sample_size * self.vae_scale_factor + + if width is None: + width = self.transformer.config.sample_size * self.vae_scale_factor + + if prompt_embeds is None: + input_ids = self.tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=77, #self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + # input_ids_t5 = self.tokenizer_t5( + # prompt, + # return_tensors="pt", + # padding="max_length", + # truncation=True, + # max_length=512, + # ).input_ids.to(self._execution_device) + + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + # outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True) + prompt_embeds = outputs.text_embeds + encoder_hidden_states = outputs.hidden_states[-2] + # encoder_hidden_states = outputs_t5.encoder_hidden_states[-2] + + prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1) + encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + if guidance_scale > 1.0: + if negative_prompt_embeds is None: + if negative_prompt is None: + negative_prompt = [""] * len(prompt) + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + input_ids = self.tokenizer( + negative_prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=77, #self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + # input_ids_t5 = self.tokenizer_t5( + # prompt, + # return_tensors="pt", + # padding="max_length", + # truncation=True, + # max_length=512, + # ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + # outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True) + negative_prompt_embeds = outputs.text_embeds + negative_encoder_hidden_states = outputs.hidden_states[-2] + # negative_encoder_hidden_states = outputs_t5.encoder_hidden_states[-2] + + + + negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1) + negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds]) + encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states]) + + # Note that the micro conditionings _do_ flip the order of width, height for the original size + # and the crop coordinates. 
This is how it was done in the original code base + micro_conds = torch.tensor( + [ + width, + height, + micro_conditioning_crop_coord[0], + micro_conditioning_crop_coord[1], + micro_conditioning_aesthetic_score, + ], + device=self._execution_device, + dtype=encoder_hidden_states.dtype, + ) + micro_conds = micro_conds.unsqueeze(0) + micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1) + + shape = (batch_size, height // self.vae_scale_factor, width // self.vae_scale_factor) + + if latents is None: + latents = torch.full( + shape, self.scheduler.config.mask_token_id, dtype=torch.long, device=self._execution_device + ) + + self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device) + + num_warmup_steps = len(self.scheduler.timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, timestep in enumerate(self.scheduler.timesteps): + if guidance_scale > 1.0: + model_input = torch.cat([latents] * 2) + else: + model_input = latents + if height == 1024: #args.resolution == 1024: + img_ids = _prepare_latent_image_ids(model_input.shape[0], model_input.shape[-2],model_input.shape[-1],model_input.device,model_input.dtype) + else: + img_ids = _prepare_latent_image_ids(model_input.shape[0],2*model_input.shape[-2],2*model_input.shape[-1],model_input.device,model_input.dtype) + txt_ids = torch.zeros(encoder_hidden_states.shape[1],3).to(device = encoder_hidden_states.device, dtype = encoder_hidden_states.dtype) + model_output = self.transformer( + hidden_states = model_input, + micro_conds=micro_conds, + pooled_projections=prompt_embeds, + encoder_hidden_states=encoder_hidden_states, + img_ids = img_ids, + txt_ids = txt_ids, + timestep = torch.tensor([timestep], device=model_input.device, dtype=torch.long), + # guidance = 7, + # cross_attention_kwargs=cross_attention_kwargs, + ) + + if guidance_scale > 1.0: + uncond_logits, cond_logits = model_output.chunk(2) + model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits) + + latents = self.scheduler.step( + model_output=model_output, + timestep=timestep, + sample=latents, + generator=generator, + ).prev_sample + + if i == len(self.scheduler.timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, timestep, latents) + + if output_type == "latent": + output = latents + else: + needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast + + if needs_upcasting: + self.vqvae.float() + + output = self.vqvae.decode( + latents, + force_not_quantize=True, + shape=( + batch_size, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + self.vqvae.config.latent_channels, + ), + ).sample.clip(0, 1) + output = self.image_processor.postprocess(output, output_type) + + if needs_upcasting: + self.vqvae.half() + + self.maybe_free_model_hooks() + + if not return_dict: + return (output,) + + return ImagePipelineOutput(output) \ No newline at end of file diff --git a/Meissonic/src/pipeline_img2img.py b/Meissonic/src/pipeline_img2img.py new file mode 100644 index 0000000000000000000000000000000000000000..acb67d6eb6d376fd22719f9401ff49d4a9354196 --- /dev/null +++ b/Meissonic/src/pipeline_img2img.py @@ -0,0 +1,337 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.models import UVit2DModel, VQModel +from diffusers.utils import replace_example_docstring +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput + +from src.scheduler import Scheduler +from src.transformer import Transformer2DModel +from src.pipeline import _prepare_latent_image_ids + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> image = pipe(prompt, input_image).images[0] + ``` +""" + +class Img2ImgPipeline(DiffusionPipeline): + image_processor: VaeImageProcessor + vqvae: VQModel + tokenizer: CLIPTokenizer + text_encoder: CLIPTextModelWithProjection + transformer: Transformer2DModel #UVit2DModel + scheduler: Scheduler + + model_cpu_offload_seq = "text_encoder->transformer->vqvae" + + # TODO - when calling self.vqvae.quantize, it uses self.vqvae.quantize.embedding.weight before + # the forward method of self.vqvae.quantize, so the hook doesn't get called to move the parameter + # off the meta device. There should be a way to fix this instead of just not offloading it + _exclude_from_cpu_offload = ["vqvae"] + + def __init__( + self, + vqvae: VQModel, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + transformer: Transformer2DModel, #UVit2DModel, + scheduler: Scheduler, + ): + super().__init__() + + self.register_modules( + vqvae=vqvae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[List[str], str]] = None, + image: PipelineImageInput = None, + strength: float = 0.5, + num_inference_steps: int = 12, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[torch.Generator] = None, + prompt_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_encoder_hidden_states: Optional[torch.Tensor] = None, + output_type="pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + micro_conditioning_aesthetic_score: int = 6, + micro_conditioning_crop_coord: Tuple[int, int] = (0, 0), + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + ): + """ + The call function to the pipeline for generation. 
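+
+        Sketch of the image-to-image flow (added for clarity): the input `image` is encoded by the
+        VQ-VAE into discrete tokens, the first `1 - strength` fraction of the timestep schedule is
+        skipped, the tokens are partially masked at the resumed timestep via `scheduler.add_noise`,
+        and the transformer then iteratively re-predicts the masked tokens under text guidance
+        before decoding back to pixels.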
+ + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.5): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 12): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. A single vector from the + pooled and projected final hidden states. + encoder_hidden_states (`torch.Tensor`, *optional*): + Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + negative_encoder_hidden_states (`torch.Tensor`, *optional*): + Analogous to `encoder_hidden_states` for the positive prompt. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. 
The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. + micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. + temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): + Configures the temperature scheduler on `self.scheduler` see `Scheduler#set_timesteps`. + + Examples: + + Returns: + [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated images. + """ + + if (prompt_embeds is not None and encoder_hidden_states is None) or ( + prompt_embeds is None and encoder_hidden_states is not None + ): + raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither") + + if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or ( + negative_prompt_embeds is None and negative_encoder_hidden_states is not None + ): + raise ValueError( + "pass either both `negative_prompt_embeds` and `negative_encoder_hidden_states` or neither" + ) + + if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None): + raise ValueError("pass only one of `prompt` or `prompt_embeds`") + + if isinstance(prompt, str): + prompt = [prompt] + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + batch_size = batch_size * num_images_per_prompt + + if prompt_embeds is None: + input_ids = self.tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=77, #self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + prompt_embeds = outputs.text_embeds + encoder_hidden_states = outputs.hidden_states[-2] + + prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1) + encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + if guidance_scale > 1.0: + if negative_prompt_embeds is None: + if negative_prompt is None: + negative_prompt = [""] * len(prompt) + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + input_ids = self.tokenizer( + negative_prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=77, #self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + negative_prompt_embeds = outputs.text_embeds + negative_encoder_hidden_states = 
outputs.hidden_states[-2] + + negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1) + negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds]) + encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states]) + + image = self.image_processor.preprocess(image) + + height, width = image.shape[-2:] + + # Note that the micro conditionings _do_ flip the order of width, height for the original size + # and the crop coordinates. This is how it was done in the original code base + micro_conds = torch.tensor( + [ + width, + height, + micro_conditioning_crop_coord[0], + micro_conditioning_crop_coord[1], + micro_conditioning_aesthetic_score, + ], + device=self._execution_device, + dtype=encoder_hidden_states.dtype, + ) + + micro_conds = micro_conds.unsqueeze(0) + micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1) + + self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device) + num_inference_steps = int(len(self.scheduler.timesteps) * strength) + start_timestep_idx = len(self.scheduler.timesteps) - num_inference_steps + + needs_upcasting = False # = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast + + if needs_upcasting: + self.vqvae.float() + + latents = self.vqvae.encode(image.to(dtype=self.vqvae.dtype, device=self._execution_device)).latents + latents_bsz, channels, latents_height, latents_width = latents.shape + latents = self.vqvae.quantize(latents)[2][2].reshape(latents_bsz, latents_height, latents_width) + latents = self.scheduler.add_noise( + latents, self.scheduler.timesteps[start_timestep_idx - 1], generator=generator + ) + latents = latents.repeat(num_images_per_prompt, 1, 1) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i in range(start_timestep_idx, len(self.scheduler.timesteps)): + timestep = self.scheduler.timesteps[i] + + if guidance_scale > 1.0: + model_input = torch.cat([latents] * 2) + else: + model_input = latents + if height == 1024: #args.resolution == 1024: + img_ids = _prepare_latent_image_ids(model_input.shape[0], model_input.shape[-2],model_input.shape[-1],model_input.device,model_input.dtype) + else: + img_ids = _prepare_latent_image_ids(model_input.shape[0],2*model_input.shape[-2],2*model_input.shape[-1],model_input.device,model_input.dtype) + txt_ids = torch.zeros(encoder_hidden_states.shape[1],3).to(device = encoder_hidden_states.device, dtype = encoder_hidden_states.dtype) + model_output = self.transformer( + model_input, + micro_conds=micro_conds, + pooled_projections=prompt_embeds, + encoder_hidden_states=encoder_hidden_states, + # cross_attention_kwargs=cross_attention_kwargs, + img_ids = img_ids, + txt_ids = txt_ids, + timestep = torch.tensor([timestep], device=model_input.device, dtype=torch.long), + ) + + if guidance_scale > 1.0: + uncond_logits, cond_logits = model_output.chunk(2) + model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits) + + latents = self.scheduler.step( + model_output=model_output, + timestep=timestep, + sample=latents, + generator=generator, + ).prev_sample + + if i == len(self.scheduler.timesteps) - 1 or ((i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, timestep, latents) + + if output_type 
== "latent": + output = latents + else: + output = self.vqvae.decode( + latents, + force_not_quantize=True, + shape=( + batch_size, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + self.vqvae.config.latent_channels, + ), + ).sample.clip(0, 1) + output = self.image_processor.postprocess(output, output_type) + + if needs_upcasting: + self.vqvae.half() + + self.maybe_free_model_hooks() + + if not return_dict: + return (output,) + + return ImagePipelineOutput(output) diff --git a/Meissonic/src/pipeline_inpaint.py b/Meissonic/src/pipeline_inpaint.py new file mode 100644 index 0000000000000000000000000000000000000000..4e8a08b0dcf357660ac9038e59b05a95eefb94a4 --- /dev/null +++ b/Meissonic/src/pipeline_inpaint.py @@ -0,0 +1,361 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.models import VQModel +from diffusers.utils import replace_example_docstring +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from src.scheduler import Scheduler +from src.transformer import Transformer2DModel +from src.pipeline import _prepare_latent_image_ids + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> pipe(prompt, input_image, mask).images[0].save("out.png") + ``` +""" + +class InpaintPipeline(DiffusionPipeline): + image_processor: VaeImageProcessor + vqvae: VQModel + tokenizer: CLIPTokenizer + text_encoder: CLIPTextModelWithProjection + transformer: Transformer2DModel #UVit2DModel + scheduler: Scheduler + + model_cpu_offload_seq = "text_encoder->transformer->vqvae" + + # TODO - when calling self.vqvae.quantize, it uses self.vqvae.quantize.embedding.weight before + # the forward method of self.vqvae.quantize, so the hook doesn't get called to move the parameter + # off the meta device. 
There should be a way to fix this instead of just not offloading it + _exclude_from_cpu_offload = ["vqvae"] + + def __init__( + self, + vqvae: VQModel, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + transformer: Transformer2DModel, #UVit2DModel, + scheduler: Scheduler, + ): + super().__init__() + + self.register_modules( + vqvae=vqvae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, + do_normalize=False, + do_binarize=True, + do_convert_grayscale=True, + do_resize=True, + ) + self.scheduler.register_to_config(masking_schedule="linear") + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[List[str], str]] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + strength: float = 1.0, + num_inference_steps: int = 12, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[torch.Generator] = None, + prompt_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_encoder_hidden_states: Optional[torch.Tensor] = None, + output_type="pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + micro_conditioning_aesthetic_score: int = 6, + micro_conditioning_crop_coord: Tuple[int, int] = (0, 0), + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one + color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, + 1)`, or `(H, W)`. 
+ strength (`float`, *optional*, defaults to 1.0): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 16): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. A single vector from the + pooled and projected final hidden states. + encoder_hidden_states (`torch.Tensor`, *optional*): + Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + negative_encoder_hidden_states (`torch.Tensor`, *optional*): + Analogous to `encoder_hidden_states` for the positive prompt. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. 
+ micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. + temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): + Configures the temperature scheduler on `self.scheduler` see `Scheduler#set_timesteps`. + + Examples: + + Returns: + [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated images. + """ + + if (prompt_embeds is not None and encoder_hidden_states is None) or ( + prompt_embeds is None and encoder_hidden_states is not None + ): + raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither") + + if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or ( + negative_prompt_embeds is None and negative_encoder_hidden_states is not None + ): + raise ValueError( + "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither" + ) + + if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None): + raise ValueError("pass only one of `prompt` or `prompt_embeds`") + + if isinstance(prompt, str): + prompt = [prompt] + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + batch_size = batch_size * num_images_per_prompt + + if prompt_embeds is None: + input_ids = self.tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=77, #self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + prompt_embeds = outputs.text_embeds + encoder_hidden_states = outputs.hidden_states[-2] + + prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1) + encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + if guidance_scale > 1.0: + if negative_prompt_embeds is None: + if negative_prompt is None: + negative_prompt = [""] * len(prompt) + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + input_ids = self.tokenizer( + negative_prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=77, #self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + negative_prompt_embeds = outputs.text_embeds + negative_encoder_hidden_states = outputs.hidden_states[-2] + + negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1) + negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds]) + encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states]) + + image = self.image_processor.preprocess(image) + + height, width = image.shape[-2:] + + # Note that the micro conditionings _do_ flip the order of width, height for the original size + # and the crop coordinates. 
This is how it was done in the original code base + micro_conds = torch.tensor( + [ + width, + height, + micro_conditioning_crop_coord[0], + micro_conditioning_crop_coord[1], + micro_conditioning_aesthetic_score, + ], + device=self._execution_device, + dtype=encoder_hidden_states.dtype, + ) + + micro_conds = micro_conds.unsqueeze(0) + micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1) + + self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device) + num_inference_steps = int(len(self.scheduler.timesteps) * strength) + start_timestep_idx = len(self.scheduler.timesteps) - num_inference_steps + + needs_upcasting = False #self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast + + if needs_upcasting: + self.vqvae.float() + + latents = self.vqvae.encode(image.to(dtype=self.vqvae.dtype, device=self._execution_device)).latents + latents_bsz, channels, latents_height, latents_width = latents.shape + latents = self.vqvae.quantize(latents)[2][2].reshape(latents_bsz, latents_height, latents_width) + + mask = self.mask_processor.preprocess( + mask_image, height // self.vae_scale_factor, width // self.vae_scale_factor + ) + mask = mask.reshape(mask.shape[0], latents_height, latents_width).bool().to(latents.device) + latents[mask] = self.scheduler.config.mask_token_id + + starting_mask_ratio = mask.sum() / latents.numel() + + latents = latents.repeat(num_images_per_prompt, 1, 1) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i in range(start_timestep_idx, len(self.scheduler.timesteps)): + timestep = self.scheduler.timesteps[i] + + if guidance_scale > 1.0: + model_input = torch.cat([latents] * 2) + else: + model_input = latents + + if height == 1024: #args.resolution == 1024: + img_ids = _prepare_latent_image_ids(model_input.shape[0], model_input.shape[-2],model_input.shape[-1],model_input.device,model_input.dtype) + else: + img_ids = _prepare_latent_image_ids(model_input.shape[0],2*model_input.shape[-2],2*model_input.shape[-1],model_input.device,model_input.dtype) + txt_ids = torch.zeros(encoder_hidden_states.shape[1],3).to(device = encoder_hidden_states.device, dtype = encoder_hidden_states.dtype) + model_output = self.transformer( + model_input, + micro_conds=micro_conds, + pooled_projections=prompt_embeds, + encoder_hidden_states=encoder_hidden_states, + # cross_attention_kwargs=cross_attention_kwargs, + img_ids = img_ids, + txt_ids = txt_ids, + timestep = torch.tensor([timestep], device=model_input.device, dtype=torch.long), + ) + + if guidance_scale > 1.0: + uncond_logits, cond_logits = model_output.chunk(2) + model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits) + + latents = self.scheduler.step( + model_output=model_output, + timestep=timestep, + sample=latents, + generator=generator, + starting_mask_ratio=starting_mask_ratio, + ).prev_sample + + if i == len(self.scheduler.timesteps) - 1 or ((i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, timestep, latents) + + if output_type == "latent": + output = latents + else: + output = self.vqvae.decode( + latents, + force_not_quantize=True, + shape=( + batch_size, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + self.vqvae.config.latent_channels, + ), + ).sample.clip(0, 1) + output = self.image_processor.postprocess(output, output_type) + + if 
needs_upcasting: + self.vqvae.half() + + self.maybe_free_model_hooks() + + if not return_dict: + return (output,) + + return ImagePipelineOutput(output) diff --git a/Meissonic/src/pipeline_video.py b/Meissonic/src/pipeline_video.py new file mode 100644 index 0000000000000000000000000000000000000000..dca2c2e3cf77741559c113b612d46a6bd1fb28fc --- /dev/null +++ b/Meissonic/src/pipeline_video.py @@ -0,0 +1,1138 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import os +# Add project root to path to allow imports when running as script +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from dataclasses import dataclass +import torch +import torch.nn as nn +from transformers import T5Tokenizer, T5EncoderModel +from diffusers.utils import replace_example_docstring +from diffusers.pipelines.pipeline_utils import DiffusionPipeline + +from src.scheduler_video import Scheduler +from src.transformer_video import WanDiscreteVideoTransformer + +# Global debug flag - set to False to disable debug prints +DEBUG_PIPELINE = False + + +@dataclass +class VideoPipelineOutput: + """ + Output class for video generation pipelines. + + Args: + videos: Generated videos. Can be: + - torch.Tensor of shape [B, C, F, H, W] when output_type="pt" + - List of tensors (one per batch) when output_type="latent" + - numpy.ndarray when output_type="np" + - List of PIL Images (frames) when output_type="pil" + """ + videos: Union[torch.Tensor, List[torch.Tensor], List, Any] + + +class CosmosVideoTokenizer(nn.Module): + """ + Wrapper around a Cosmos DV (Discrete Video) tokenizer for video encoding/decoding. + + This class provides a clean interface to encode videos into discrete codes and decode + them back to video tensors. It wraps the Cosmos DV tokenizer models loaded from HuggingFace. + + Attributes: + t_downsample (int): Temporal compression factor (frames downsampled by this factor). + h_downsample (int): Height compression factor (height downsampled by this factor). + w_downsample (int): Width compression factor (width downsampled by this factor). + codebook_size (int): Number of unique discrete codes in the codebook. + mask_token_id (int): Token ID used for masking during diffusion. Set to codebook_size, + meaning valid token indices are [0, codebook_size-1], and codebook_size + is reserved for masking. + """ + + def __init__(self, model_id: str, device: torch.device, dtype: torch.dtype): + """ + Initialize the Cosmos DV video tokenizer. + + Args: + model_id (str): HuggingFace model identifier (e.g., "Cosmos-Tokenizer-DV8x16x16-720p"). + Can be a full repo_id like "nvidia/Cosmos-0.1-Tokenizer-DV8x16x16". + device (torch.device): Device to load the tokenizer on (e.g., "cuda" or "cpu"). + dtype (torch.dtype): Data type for the tokenizer (e.g., torch.float32, torch.bfloat16). 
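+
+        Example (illustrative sketch only; assumes the checkpoint's ``encoder.jit`` and
+        ``decoder.jit`` are downloadable from the Hub as described above):
+
+            >>> tok = CosmosVideoTokenizer("Cosmos-Tokenizer-DV8x16x16-720p", torch.device("cuda"), torch.bfloat16)
+            >>> codes = tok.encode(torch.rand(1, 3, 16, 480, 848))  # LongTensor of shape [1, F', H', W']
+            >>> video = tok.decode(codes)                           # float video in [0, 1], shape [1, 3, ~16, 480, 848]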
+ """ + super().__init__() + self.device = device + self.dtype = dtype + self.model_id = model_id + + # Try to load from HuggingFace Hub + try: + from huggingface_hub import snapshot_download + import os + + # Handle both full repo_id and just model name + if "/" not in model_id: + repo_id = f"nvidia/{model_id}" + else: + repo_id = model_id + + # Download the model + local_dir = f"pretrained_ckpts/{model_id.replace('/', '_')}" + os.makedirs(local_dir, exist_ok=True) + snapshot_download(repo_id=repo_id, local_dir=local_dir) + + # Try loading as torch.jit models + encoder_path = f"{local_dir}/encoder.jit" + decoder_path = f"{local_dir}/decoder.jit" + + if os.path.exists(encoder_path) and os.path.exists(decoder_path): + # Load models in float32 (TorchScript models often don't support dtype conversion well) + # We'll convert inputs to float32 when needed + self.encoder = torch.jit.load(encoder_path).to(device).eval() + self.decoder = torch.jit.load(decoder_path).to(device).eval() + # Store the model dtype (typically float32 for TorchScript) + self.model_dtype = torch.float32 + else: + # Try alternative loading methods (e.g., from diffusers or transformers) + raise FileNotFoundError(f"Could not find encoder.jit or decoder.jit in {local_dir}") + + except Exception as e: + # Fallback: try loading via diffusers or other methods + try: + # Alternative: try AutoModel or other loading methods + # This is a placeholder - adjust based on actual Cosmos tokenizer API + raise NotImplementedError( + f"Failed to load Cosmos tokenizer from {model_id}. " + f"Error: {e}. Please ensure the model is available on HuggingFace Hub." + ) + except Exception as e2: + raise RuntimeError( + f"Could not load Cosmos tokenizer: {e}. " + f"Please check that the model_id '{model_id}' is correct and accessible." + ) from e2 + + # Compression factors for DV8x16x16 model + # These values depend on the specific model architecture + # For Cosmos-Tokenizer-DV8x16x16-720p: + # - Temporal: 8x downsampling (16 frames -> 2 frames) + # - Spatial: 16x16 downsampling (480x848 -> 30x53) + self.t_downsample = 4 #8 # Temporal compression factor + self.h_downsample = 8 #16 # Height compression factor + self.w_downsample = 8 #16 # Width compression factor + self.codebook_size = 64000 #65536 # Number of unique codes (2^16) + + # Mask token ID: codebook_size is reserved for masking during diffusion + # This ensures all Cosmos codes [0, codebook_size-1] remain valid + # Extended vocab: [0, codebook_size-1] = valid codes, codebook_size = mask_token_id + self.mask_token_id = self.codebook_size + + def encode(self, video: torch.Tensor) -> torch.LongTensor: + """ + Encode a video tensor into discrete code indices. + + Args: + video (torch.Tensor): Input video tensor of shape [B, C, F, H, W]. + Values should be in [0, 1] or [0, 255]. + If values are > 1.0, they will be normalized to [0, 1]. + + Returns: + torch.LongTensor: Encoded discrete codes of shape [B, F', H', W'] where: + - F' ≈ F // t_downsample (temporal dimension after compression, may vary slightly due to padding) + - H' = H // h_downsample (height after compression) + - W' = W // w_downsample (width after compression) + + Note: + The actual output shape may differ slightly from the theoretical compression + due to model-specific padding or overlap behavior. 
+ """ + # Normalize video to [0, 1] if necessary + if video.max() > 1.0: + video = video / 255.0 + + # Ensure video is on correct device and convert to model dtype (typically float32) + # TorchScript models often require float32 inputs + video = video.to(self.device).to(self.model_dtype) + + # Encode the video + with torch.no_grad(): + # The encoder typically returns (indices, ...) or just indices + # Adjust based on actual Cosmos encoder API + if hasattr(self.encoder, 'encode'): + result = self.encoder.encode(video) + if isinstance(result, tuple): + indices = result[0] + else: + indices = result + else: + # Direct call if encoder is a callable model + result = self.encoder(video) + if isinstance(result, tuple): + indices = result[0] + else: + indices = result + + # Ensure indices are LongTensor + if not isinstance(indices, torch.LongTensor): + indices = indices.long() + + return indices + + def decode(self, codes: torch.LongTensor) -> torch.Tensor: + """ + Decode discrete code indices back into a video tensor. + + Args: + codes (torch.LongTensor): Encoded discrete codes of shape [B, F', H', W']. + + Returns: + torch.Tensor: Reconstructed video tensor of shape [B, C, F, H, W] where: + - F ≈ F' * t_downsample (may vary slightly due to model-specific behavior) + - H = H' * h_downsample + - W = W' * w_downsample + Values are in [0, 1] range. + + Note: + The output frame count may differ slightly from the original input due to + model-specific temporal interpolation or padding behavior. + """ + # Ensure codes are on correct device + codes = codes.to(self.device) + + # Decode the codes + with torch.no_grad(): + if hasattr(self.decoder, 'decode'): + reconstructed_video = self.decoder.decode(codes) + else: + # Direct call if decoder is a callable model + reconstructed_video = self.decoder(codes) + + # Ensure output is in [0, 1] range and convert to desired dtype + reconstructed_video = torch.clamp(reconstructed_video, 0.0, 1.0) + + # Convert to the tokenizer's dtype if different from model dtype + if reconstructed_video.dtype != self.dtype: + reconstructed_video = reconstructed_video.to(self.dtype) + + return reconstructed_video + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> image = pipe(prompt).images[0] + ``` +""" + + +class Pipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + transformer: WanDiscreteVideoTransformer + scheduler: Scheduler + video_tokenizer: CosmosVideoTokenizer + + model_cpu_offload_seq = "text_encoder->transformer->video_tokenizer" + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + transformer: WanDiscreteVideoTransformer, + scheduler: Scheduler, + video_tokenizer: CosmosVideoTokenizer, + text_len: int = 512, + num_frames: int = 16, + height: int = 480, + width: int = 848, + ): + """ + Initialize the video diffusion pipeline. + + Args: + tokenizer (T5Tokenizer): Wan-style T5 tokenizer (UMT5) for text encoding. + text_encoder (T5EncoderModel): Wan-style T5 encoder (UMT5-base outputs 768, UMT5-large outputs 4096). + transformer (WanDiscreteVideoTransformer): The discrete video transformer model + that handles token embedding and logits prediction. Supports dynamic input dimensions. + scheduler (Scheduler): The diffusion scheduler. + video_tokenizer (CosmosVideoTokenizer): Cosmos DV tokenizer + for video encoding/decoding. Required for video generation. + text_len (int): Maximum text sequence length (default: 512). + num_frames (int): Default number of frames in the video (default: 16). 
+ Can be overridden in __call__. Must be divisible by tokenizer's t_downsample (8). + height (int): Default height of the video in pixels (default: 480). + Can be overridden in __call__. Must be divisible by tokenizer's h_downsample (16). + width (int): Default width of the video in pixels (default: 848). + Can be overridden in __call__. Must be divisible by tokenizer's w_downsample (16). + + Note: + The transformer now supports dynamic input dimensions, so users can generate videos + with different frame counts and resolutions by specifying them in __call__(). + """ + super().__init__() + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=scheduler, + video_tokenizer=video_tokenizer, + ) + self.text_len = text_len + # Store default video dimensions (can be overridden in __call__) + self.num_frames = num_frames + self.height = height + self.width = width + + # Get codebook size from video tokenizer + self.codebook_size = video_tokenizer.codebook_size + + # IMPORTANT: Index mapping semantics for discrete diffusion + # ============================================================ + # Cosmos tokenizer outputs indices in [0, codebook_size-1] (all valid codes, no mask) + # We extend the vocab by adding mask_token_id = codebook_size + # + # Model vocab: [0, codebook_size] where: + # - [0, codebook_size-1] = actual Cosmos codes (direct mapping, no shift needed) + # - codebook_size = mask_token_id (reserved for masking during diffusion) + # + # This design ensures: + # - All Cosmos codes remain valid (no information loss) + # - mask_token_id is outside the codebook range, safe for masking + # - No need for +1/-1 mapping, Cosmos codes map directly to [0, codebook_size-1] + # + # When decoding back to Cosmos: + # - Model outputs in [0, codebook_size] (may contain codebook_size=mask) + # - Filter out mask tokens: clamp to [0, codebook_size-1] + # - This ensures Cosmos only sees valid codes [0, codebook_size-1] + # ============================================================ + + # Set mask_token_id to codebook_size (outside valid code range) + # This will be used by scheduler + self.mask_token_id = self.codebook_size + + # Calculate default compressed dimensions + # These are used as defaults in __call__ and can be overridden + self.F_prime = num_frames // video_tokenizer.t_downsample + self.H_prime = height // video_tokenizer.h_downsample + self.W_prime = width // video_tokenizer.w_downsample + + def _encode_prompt_wan( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Encode prompt(s) using Wan's T5 text encoder. + + Args: + prompt (Union[str, List[str]]): The prompt or prompts to encode. + negative_prompt (Optional[Union[str, List[str]]]): The negative prompt(s) for CFG. + num_images_per_prompt (int): Number of images to generate per prompt. + do_classifier_free_guidance (bool): Whether to use classifier-free guidance. 
+ + Returns: + Tuple[torch.Tensor, Optional[torch.Tensor]]: + - encoder_hidden_states: [B_total, L_text, D_text] where B_total = B * num_images_per_prompt + - encoder_hidden_states_neg: [B_total, L_text, D_text] for CFG, or None if not using CFG + """ + if isinstance(prompt, str): + prompt = [prompt] + + # Tokenize prompts + input_ids = self.tokenizer( + prompt, + padding="max_length", + truncation=True, + max_length=self.text_len, + return_tensors="pt" + )["input_ids"].to(self._execution_device) + + # Encode prompts + with torch.no_grad(): + outputs = self.text_encoder(input_ids, return_dict=True) + encoder_hidden_states = outputs.last_hidden_state # [B, L_text, D_text] + + # Repeat for num_images_per_prompt + encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + # Handle negative prompts for CFG + encoder_hidden_states_neg = None + if do_classifier_free_guidance: + if negative_prompt is None: + negative_prompt = [""] * len(prompt) + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + # Tokenize negative prompts + negative_input_ids = self.tokenizer( + negative_prompt, + padding="max_length", + truncation=True, + max_length=self.text_len, + return_tensors="pt" + )["input_ids"].to(self._execution_device) + + # Encode negative prompts + with torch.no_grad(): + negative_outputs = self.text_encoder(negative_input_ids, return_dict=True) + encoder_hidden_states_neg = negative_outputs.last_hidden_state # [B, L_text, D_text] + + # Repeat for num_images_per_prompt + encoder_hidden_states_neg = encoder_hidden_states_neg.repeat(num_images_per_prompt, 1, 1) + + # Assertions for shape verification + B_total = len(prompt) * num_images_per_prompt + assert encoder_hidden_states.shape == (B_total, self.text_len, encoder_hidden_states.shape[-1]), ( + f"Expected encoder_hidden_states shape ({B_total}, {self.text_len}, D_text), " + f"got {encoder_hidden_states.shape}" + ) + if encoder_hidden_states_neg is not None: + assert encoder_hidden_states_neg.shape == (B_total, self.text_len, encoder_hidden_states_neg.shape[-1]), ( + f"Expected encoder_hidden_states_neg shape ({B_total}, {self.text_len}, D_text), " + f"got {encoder_hidden_states_neg.shape}" + ) + + return encoder_hidden_states, encoder_hidden_states_neg + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[List[str], str]] = None, + num_frames: Optional[int] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 48, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.IntTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + negative_encoder_hidden_states: Optional[torch.Tensor] = None, + output_type="pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + micro_conditioning_aesthetic_score: int = 6, + micro_conditioning_crop_coord: Tuple[int, int] = (0, 0), + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `encoder_hidden_states`. 
+ num_frames (`int`, *optional*, defaults to `self.num_frames`): + Number of frames in the generated video. + height (`int`, *optional*, defaults to `self.height`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.width`): + The width in pixels of the generated video. + num_inference_steps (`int`, *optional*, defaults to 16): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.LongTensor`, *optional*): + Pre-generated 3D video codes of shape `[B, F', H', W']` where F', H', W' are the compressed + dimensions after Cosmos tokenization. If not provided, the starting codes will be completely + masked (filled with mask_token_id). + encoder_hidden_states (`torch.Tensor`, *optional*): + Pre-generated encoder hidden states from the T5 text encoder. If not provided, will be generated + from the `prompt` input argument using Wan's T5 encoder. + negative_encoder_hidden_states (`torch.Tensor`, *optional*): + Pre-generated negative encoder hidden states for classifier-free guidance. If not provided, will be + generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. + micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. 
+ temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): + Configures the temperature scheduler on `self.scheduler` see `Scheduler#set_timesteps`. + + Examples: + + Returns: + [`VideoPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`VideoPipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element contains the generated videos. + The output format depends on `output_type`: + - `"latent"`: Discrete code tensor of shape `[B, F', H', W']` + - `"pt"`: Float tensor of shape `[B, C, F, H, W]` with values in [0, 1] + - `"np"`: Numpy array of shape `[B, C, F, H, W]` + - `"pil"`: List of lists of PIL Images (one list per batch, each containing frames) + """ + # Validate inputs + if prompt is None and encoder_hidden_states is None: + raise ValueError("Either `prompt` or `encoder_hidden_states` must be provided") + + if prompt is not None and encoder_hidden_states is not None: + raise ValueError("Cannot pass both `prompt` and `encoder_hidden_states`") + + # Determine batch size + if prompt is not None: + if isinstance(prompt, str): + prompt = [prompt] + batch_size = len(prompt) + else: + batch_size = encoder_hidden_states.shape[0] + + batch_size = batch_size * num_images_per_prompt + + # Use provided dimensions or fall back to defaults + if num_frames is None: + num_frames = self.num_frames + if height is None: + height = self.height + if width is None: + width = self.width + + # Validate dimensions are divisible by tokenizer's downsampling factors + t_ds = self.video_tokenizer.t_downsample + h_ds = self.video_tokenizer.h_downsample + w_ds = self.video_tokenizer.w_downsample + + # if num_frames % t_ds != 0: + # raise ValueError( + # f"num_frames ({num_frames}) must be divisible by temporal downsampling factor ({t_ds})" + # ) + # if height % h_ds != 0: + # raise ValueError( + # f"height ({height}) must be divisible by height downsampling factor ({h_ds})" + # ) + # if width % w_ds != 0: + # raise ValueError( + # f"width ({width}) must be divisible by width downsampling factor ({w_ds})" + # ) + + # Calculate compressed dimensions for this generation + F_prime = num_frames // t_ds + H_prime = height // h_ds + W_prime = width // w_ds + + # Encode prompts using Wan's T5 encoder + do_classifier_free_guidance = guidance_scale > 1.0 + + if encoder_hidden_states is None: + encoder_hidden_states, encoder_hidden_states_neg = self._encode_prompt_wan( + prompt=prompt, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + ) + else: + # Use pre-computed encoder_hidden_states + encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + if do_classifier_free_guidance: + if negative_encoder_hidden_states is None: + raise ValueError("`negative_encoder_hidden_states` must be provided when using guidance_scale > 1.0") + encoder_hidden_states_neg = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + else: + encoder_hidden_states_neg = None + + # Stack negative and positive for classifier-free guidance + if do_classifier_free_guidance: + # Stack [negative, positive] along batch dimension + encoder_hidden_states = torch.cat([encoder_hidden_states_neg, encoder_hidden_states], dim=0) + # Verify shape: should be (2 * batch_size, text_len, text_dim) + assert encoder_hidden_states.shape[0] == 2 * batch_size, ( + f"Expected batch size {2 * batch_size} after CFG stacking, got {encoder_hidden_states.shape[0]}" + ) + + # Initialize 3D 
video codes: [B, F', H', W'] + # Note: latents_codes use extended vocab [0, codebook_size] where: + # - [0, codebook_size-1] = valid Cosmos codes (direct mapping) + # - codebook_size = mask_token_id + # If provided latents are from Cosmos (range [0, codebook_size-1]), they're already correct + if latents is None: + # Start with all mask tokens (codebook_size) in extended vocab space + latents_codes = torch.full( + (batch_size, F_prime, H_prime, W_prime), + self.mask_token_id, # codebook_size = mask_token_id + dtype=torch.long, + device=self._execution_device + ) + else: + # If latents are provided, assume they are already in extended vocab format [0, codebook_size] + # Cosmos codes [0, codebook_size-1] map directly, no shift needed + latents_codes = latents + assert latents_codes.shape[1:] == (F_prime, H_prime, W_prime), ( + f"Expected latents shape [B, {F_prime}, {H_prime}, {W_prime}], " + f"got {latents_codes.shape}" + ) + # Verify values are in extended vocab range [0, codebook_size] + assert latents_codes.min() >= 0 and latents_codes.max() <= self.codebook_size, ( + f"Latents values should be in [0, {self.codebook_size}] (extended vocab), " + f"got range [{latents_codes.min()}, {latents_codes.max()}]" + ) + + # Print initial latents shape for debugging + if DEBUG_PIPELINE: + print(f"Initial latents_codes shape: {latents_codes.shape} [B, F', H', W']") + + self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device) + + num_warmup_steps = len(self.scheduler.timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, timestep in enumerate(self.scheduler.timesteps): + # Handle classifier-free guidance: duplicate codes if needed + # IMPORTANT: Always build latents_codes_input from latents_codes (not from previous iteration's latents_codes_input) + if guidance_scale > 1.0: + latents_codes_input = torch.cat([latents_codes] * 2, dim=0) + batch_size_total = 2 * batch_size + if DEBUG_PIPELINE: + print(f"[DEBUG] CFG: latents_codes.shape={latents_codes.shape}, latents_codes_input.shape={latents_codes_input.shape}, encoder_hidden_states.shape={encoder_hidden_states.shape}") + else: + latents_codes_input = latents_codes + batch_size_total = batch_size + if DEBUG_PIPELINE: + print(f"[DEBUG] latents_codes.shape={latents_codes.shape}, latents_codes_input.shape={latents_codes_input.shape}, encoder_hidden_states.shape={encoder_hidden_states.shape}") + + # Verify shapes before transformer call + assert latents_codes_input.shape[0] == batch_size_total, ( + f"latents_codes_input batch mismatch: {latents_codes_input.shape[0]} != {batch_size_total}" + ) + assert encoder_hidden_states.shape[0] == batch_size_total, ( + f"encoder_hidden_states batch mismatch: {encoder_hidden_states.shape[0]} != {batch_size_total}" + ) + + # Prepare timestep tensor: [B_total] + timestep_tensor = torch.full( + (batch_size_total,), + timestep, + dtype=torch.long, + device=self._execution_device + ) + + # Call transformer + if DEBUG_PIPELINE: + print(f"[DEBUG] Before transformer: tokens.shape={latents_codes_input.shape}, timesteps.shape={timestep_tensor.shape}, encoder_hidden_states.shape={encoder_hidden_states.shape}") + logits = self.transformer( + tokens=latents_codes_input, + timesteps=timestep_tensor, + encoder_hidden_states=encoder_hidden_states, + y=None, + ) + if DEBUG_PIPELINE: + print(f"[DEBUG] After transformer: logits.shape={logits.shape}, expected batch={batch_size_total}") + + # Verify logits shape matches expected token 
count + # logits: [B_total, vocab_size, F_out, H_out, W_out] where vocab_size = codebook_size + 1 + # latents_codes: [B_total, F', H', W'] with values in [0, vocab_size-1] + vocab_size = self.codebook_size + 1 + assert logits.shape[0] == batch_size_total, ( + f"Logits batch size mismatch: {logits.shape[0]} != {batch_size_total}" + ) + assert logits.shape[1] == vocab_size, ( + f"Logits vocab size mismatch: {logits.shape[1]} != {vocab_size} (expected codebook_size+1)" + ) + + # Apply classifier-free guidance if needed + # logits shape: [B_total, vocab_size, F_out, H_out, W_out] + if guidance_scale > 1.0: + uncond_logits, cond_logits = logits.chunk(2, dim=0) + logits = uncond_logits + guidance_scale * (cond_logits - uncond_logits) + # After CFG, logits batch becomes batch_size (not 2*batch_size) + # We'll use latents_codes (not latents_codes_input) for flattening, so no need to update it here + + # Flatten video tokens for scheduler: [B, F, H, W] -> [B, N] where N = F*H*W + # Scheduler expects 1D token sequences, so we flatten the spatial-temporal dimensions + # Use logits.shape[0] as the batch size (after CFG, this is batch_size, not batch_size_total) + B_flat, vocab_size, F_flat, H_flat, W_flat = logits.shape + N = F_flat * H_flat * W_flat + + # After CFG, logits batch is batch_size (not 2*batch_size) + # Use latents_codes (not latents_codes_input) for flattening, since latents_codes is always [B, F, H, W] + if DEBUG_PIPELINE: + print(f"[DEBUG] After CFG: logits.shape={logits.shape}, latents_codes.shape={latents_codes.shape}") + + # Handle shape mismatch: transformer output may have different spatial dimensions due to patch_size + # Crop or pad latents_codes to match logits dimensions + if latents_codes.shape[1:] != (F_flat, H_flat, W_flat): + if DEBUG_PIPELINE: + print(f"[DEBUG] Shape mismatch detected: latents_codes {latents_codes.shape[1:]} != logits {logits.shape[2:]}, adjusting...") + B_lat, F_lat, H_lat, W_lat = latents_codes.shape + + # Create a new tensor with the correct shape, filled with mask_token_id + old_shape = latents_codes.shape + new_latents_codes = torch.full( + (B_flat, F_flat, H_flat, W_flat), + self.mask_token_id, + dtype=latents_codes.dtype, + device=latents_codes.device + ) + + # Copy overlapping region from latents_codes + F_copy = min(F_lat, F_flat) + H_copy = min(H_lat, H_flat) + W_copy = min(W_lat, W_flat) + new_latents_codes[:, :F_copy, :H_copy, :W_copy] = latents_codes[:, :F_copy, :H_copy, :W_copy] + + latents_codes = new_latents_codes + if DEBUG_PIPELINE: + print(f"[DEBUG] Adjusted latents_codes from {old_shape} to {latents_codes.shape} (copied {F_copy}x{H_copy}x{W_copy} region)") + + # Verify shapes match after adjustment + assert latents_codes.shape == (B_flat, F_flat, H_flat, W_flat), ( + f"Shape mismatch after adjustment: logits [B, vocab, F, H, W]={logits.shape}, " + f"latents_codes [B, F, H, W]={latents_codes.shape}" + ) + assert logits.numel() == B_flat * vocab_size * N, ( + f"logits.numel: {logits.numel()}, reshape target: {B_flat*vocab_size*N}" + ) + + tokens_flat = latents_codes.view(B_flat, N) # [B, N] + logits_flat = logits.permute(0, 2, 3, 4, 1).reshape(B_flat, N, vocab_size) # [B, N, vocab_size] + assert (tokens_flat >= 0).all() and (tokens_flat < vocab_size).all(), ( + f"[DEBUG] Out-of-range token: min={tokens_flat.min().item()}, max={tokens_flat.max().item()}, vocab_size={vocab_size}" + ) + + # Scheduler step: update discrete codes based on logits + # Scheduler works on 1D token sequences [B_total, N] with logits [B_total, N, vocab] + 
scheduler_output = self.scheduler.step( + model_output=logits_flat, # [B_total, N, vocab] + timestep=timestep, + sample=tokens_flat, # [B_total, N] + generator=generator, + ) + + # Unflatten back to video grid: [B, N] -> [B, F, H, W] + # Note: After CFG, B_flat = batch_size (not 2*batch_size), so latents_codes_updated is already [B, F, H, W] + tokens_flat = scheduler_output.prev_sample # [B, N] + latents_codes = tokens_flat.view(B_flat, F_flat, H_flat, W_flat) # [B, F, H, W] + + # No need to slice after CFG, because B_flat is already batch_size (not 2*batch_size) + # The CFG merging was done on logits before flattening, so latents_codes is already the correct size + + if i == len(self.scheduler.timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, timestep, latents_codes) + + # Final codes after denoising: [B, F', H', W'] + final_codes = latents_codes + + self.maybe_free_model_hooks() + + # Handle output based on output_type + if output_type == "latent": + # Return discrete code tensor [B, F', H', W'] + output = final_codes + # Verify output dtype and shape + assert output.dtype in (torch.long, torch.int64), \ + f"Expected latent output dtype torch.long or torch.int64, got {output.dtype}" + assert output.shape == (batch_size, F_prime, H_prime, W_prime), ( + f"Expected latent output shape {(batch_size, F_prime, H_prime, W_prime)}, " + f"got {output.shape}" + ) + else: + # Decode codes back to RGB video using Cosmos tokenizer + # IMPORTANT: Model uses extended vocab [0, codebook_size] where: + # - [0, codebook_size-1] = valid Cosmos codes (direct mapping) + # - codebook_size = mask_token_id + # Cosmos expects [0, codebook_size-1], so we filter out mask tokens + with torch.no_grad(): + # Map from model vocab [0, codebook_size] to Cosmos vocab [0, codebook_size-1] + # Clamp to [0, codebook_size-1] to filter out mask_token_id (codebook_size) + cosmos_codes = torch.clamp(final_codes, min=0, max=self.codebook_size - 1) + videos = self.video_tokenizer.decode(cosmos_codes) + + # Postprocess to standard video output format + # videos is in [0, 1] range as float tensor + if output_type == "np": + # Convert to numpy array (convert to float32 first, numpy doesn't support bfloat16) + videos_cpu = videos.cpu() + if videos_cpu.dtype != torch.float32: + videos_cpu = videos_cpu.to(torch.float32) + output = videos_cpu.numpy() + elif output_type == "pil": + # Convert to list of PIL Images (one list per batch, each containing frames) + import numpy as np + from PIL import Image + output = [] + videos_cpu = videos.cpu() + if videos_cpu.dtype != torch.float32: + videos_cpu = videos_cpu.to(torch.float32) + for b in range(batch_size): + video_frames = [] + for f in range(videos_cpu.shape[2]): # Loop over frames + frame = videos_cpu[b, :, f, :, :].numpy() # [C, H, W] + frame = np.transpose(frame, (1, 2, 0)) # [H, W, C] + frame = (frame * 255).astype(np.uint8) + video_frames.append(Image.fromarray(frame)) + output.append(video_frames) + else: + # output_type == "pt", keep as float tensor in [0, 1] + output = videos + + if not return_dict: + return (output,) + + return VideoPipelineOutput(videos=output) + + +def test_cosmos_tokenizer_shapes(): + """ + Independent test for CosmosVideoTokenizer encode/decode shape verification. 
+ + Tests: + - Encode video tensor [B, C, F, H, W] -> codes [B, F', H', W'] + - Decode codes [B, F', H', W'] -> video tensor [B, C, F, H, W] + - Verify shape consistency and compression factors + """ + import torch + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + dtype = torch.float32 + + print("=" * 80) + print("[Test] CosmosVideoTokenizer encode/decode shape test") + print("=" * 80) + + try: + model_id = "Cosmos-1.0-Tokenizer-DV8x16x16" + print(f"Loading tokenizer: {model_id}") + + video_tokenizer = CosmosVideoTokenizer(model_id=model_id, device=device, dtype=dtype) + print(f"✓ Tokenizer loaded") + print(f" Codebook size: {video_tokenizer.codebook_size}") + print(f" Mask token ID: {video_tokenizer.mask_token_id}") + print(f" Compression: {video_tokenizer.t_downsample}x{video_tokenizer.h_downsample}x{video_tokenizer.w_downsample}") + + # Test encode/decode with video shape that aligns well with compression factors + # Use frame count that's a multiple of t_downsample to minimize rounding issues + # t_downsample=8, so use 16 frames (16/8=2 compressed frames) + # h_downsample=16, w_downsample=16, so use dimensions divisible by 16 + B, C, F, H, W = 1, 3, 16, 480, 848 + test_video = torch.rand(B, C, F, H, W, device=device, dtype=dtype) + print(f"\nInput video shape: {test_video.shape} [B, C, F, H, W]") + + # Encode + codes = video_tokenizer.encode(test_video) + assert codes.ndim == 4, f"Expected codes to be 4D [B, F', H', W'], got {codes.ndim}D" + assert codes.shape[0] == B, f"Batch size mismatch: {codes.shape[0]} != {B}" + + F_prime = codes.shape[1] + H_prime = codes.shape[2] + W_prime = codes.shape[3] + print(f"Encoded codes shape: {codes.shape} [B, F', H', W']") + + # Verify compression factors (allow small rounding errors for temporal dimension) + expected_F_prime = F // video_tokenizer.t_downsample + assert abs(F_prime - expected_F_prime) <= 1, \ + f"Frame compression mismatch: {F_prime} vs expected ~{expected_F_prime} (from {F} // {video_tokenizer.t_downsample})" + + expected_H_prime = H // video_tokenizer.h_downsample + assert H_prime == expected_H_prime, \ + f"Height compression mismatch: {H_prime} vs {expected_H_prime} (from {H} // {video_tokenizer.h_downsample})" + + expected_W_prime = W // video_tokenizer.w_downsample + assert W_prime == expected_W_prime, \ + f"Width compression mismatch: {W_prime} vs {expected_W_prime} (from {W} // {video_tokenizer.w_downsample})" + + # Decode + decoded = video_tokenizer.decode(codes) + decoded_B, decoded_C, decoded_F, decoded_H, decoded_W = decoded.shape + + # Verify decoded shape (allow small rounding errors for temporal dimension) + assert decoded_B == B, f"Decoded batch size mismatch: {decoded_B} != {B}" + assert decoded_C == C, f"Decoded channel mismatch: {decoded_C} != {C}" + assert decoded_H == H, f"Decoded height mismatch: {decoded_H} != {H}" + assert decoded_W == W, f"Decoded width mismatch: {decoded_W} != {W}" + + # Frame count may differ slightly due to tokenizer's temporal interpolation/padding + # Allow ±1 frame tolerance + assert abs(decoded_F - F) <= 1, \ + f"Decoded frame count mismatch: {decoded_F} vs {F} (allowed ±1 frame tolerance)" + + print(f"Decoded video shape: {decoded.shape} [B, C, F, H, W]") + if decoded_F != F: + print(f" Note: Frame count differs by {decoded_F - F} (expected {F}, got {decoded_F})") + print(f" This is acceptable due to tokenizer's temporal interpolation behavior") + + print(f"\n✓ All shape checks passed!") + print(f" Compression: {F}x{H}x{W} -> {F_prime}x{H_prime}x{W_prime} -> 
{decoded_F}x{H}x{W}") + print(f" Compression ratios: temporal={F_prime/F:.3f}, spatial={H_prime*W_prime/(H*W):.3f}") + return True + + except Exception as e: + print(f"\n✗ CosmosVideoTokenizer shape test failed: {e}") + import traceback + traceback.print_exc() + return False + + +def test_pipeline_forward_latent_only(pipe, device): + """ + Test pipeline forward pass with latent-only output (no decoding). + + This test verifies: + - Pipeline initialization and forward pass + - Shape consistency through the denoising loop + - Latent output format [B, F', H', W'] + - Token value ranges [0, codebook_size] + """ + print("\n" + "=" * 80) + print("[Test] Pipeline forward pass (latent-only output)") + print("=" * 80) + + try: + prompt = ["a test prompt"] + num_frames = 8 + height = 256 + width = 448 + num_inference_steps = 2 + + print(f"Test parameters:") + print(f" prompt: {prompt}") + print(f" num_frames: {num_frames}") + print(f" height: {height}, width: {width}") + print(f" num_inference_steps: {num_inference_steps}") + print(f" output_type: 'latent'") + + # Run pipeline with latent output + result = pipe( + prompt=prompt, + num_frames=num_frames, + height=height, + width=width, + num_inference_steps=num_inference_steps, + output_type="latent", + return_dict=True, + ) + + # Verify output shape and type + output = result.videos + assert isinstance(output, torch.Tensor), f"Expected torch.Tensor, got {type(output)}" + assert output.dtype in (torch.long, torch.int64), \ + f"Expected dtype torch.long or torch.int64, got {output.dtype}" + + # Calculate expected compressed dimensions + F_prime = num_frames // pipe.video_tokenizer.t_downsample + H_prime = height // pipe.video_tokenizer.h_downsample + W_prime = width // pipe.video_tokenizer.w_downsample + + expected_shape = (1, F_prime, H_prime, W_prime) + assert output.shape == expected_shape, \ + f"Expected output shape {expected_shape}, got {output.shape}" + + print(f"\n✓ Output shape verified: {output.shape} [B, F', H', W']") + + # Check token value ranges + min_val = output.min().item() + max_val = output.max().item() + codebook_size = pipe.video_tokenizer.codebook_size + + print(f"Token value range: [{min_val}, {max_val}]") + print(f"Codebook size: {codebook_size}") + + # Tokens should be in [0, codebook_size] (codebook_size is mask_token_id) + assert min_val >= 0, f"Token values should be >= 0, got min={min_val}" + assert max_val <= codebook_size, \ + f"Token values should be <= {codebook_size}, got max={max_val}" + + # Print sample tokens + sample_tokens = output[0, 0, :5, :5].cpu().numpy() + print(f"\nSample tokens (first 5x5 of first frame):") + print(sample_tokens) + + print(f"\n✓ All latent-only tests passed!") + return True + + except Exception as e: + print(f"\n✗ Pipeline forward test failed: {e}") + import traceback + traceback.print_exc() + return False + + +# if __name__ == "__main__": +# """ +# Comprehensive test for the video diffusion pipeline. + +# Test sequence: +# 1. test_cosmos_tokenizer_shapes() - Verify CosmosVideoTokenizer encode/decode +# 2. Build all pipeline components +# 3. test_pipeline_forward_latent_only() - Test pipeline forward pass with latent output +# 4. 
Full pipeline test with PIL output (optional, after latent test passes) +# """ +# import torch + +# # Set device and dtype +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# dtype = torch.float32 + +# print("=" * 80) +# print("Testing Video Diffusion Pipeline") +# print("=" * 80) + +# # Step 1: Test CosmosVideoTokenizer shapes +# if not test_cosmos_tokenizer_shapes(): +# print("\n✗ CosmosVideoTokenizer test failed. Exiting.") +# exit(1) + +# # Step 2: Build pipeline components +# print("\n" + "=" * 80) +# print("Building Pipeline Components") +# print("=" * 80) + +# try: +# model_id = "Cosmos-1.0-Tokenizer-DV8x16x16" +# print(f"\nLoading video_tokenizer: {model_id}") +# video_tokenizer = CosmosVideoTokenizer(model_id=model_id, device=device, dtype=dtype) +# print(f"✓ Video tokenizer loaded") + +# print(f"\nLoading T5 tokenizer and encoder...") +# from transformers import T5Tokenizer, T5EncoderModel +# tokenizer = T5Tokenizer.from_pretrained('google/umt5-base') +# text_encoder = T5EncoderModel.from_pretrained('google/umt5-base').to(device, dtype=dtype) +# print(f"✓ T5 components loaded") + +# print(f"\nInitializing Scheduler...") +# from src.scheduler_video import Scheduler +# # mask_token_id = codebook_size (outside valid code range [0, codebook_size-1]) +# scheduler = Scheduler( +# mask_token_id=video_tokenizer.mask_token_id, # = codebook_size +# masking_schedule="cosine" +# ) +# print(f"✓ Scheduler initialized") +# print(f" Scheduler mask_token_id: {scheduler.config.mask_token_id}") +# print(f" Video tokenizer codebook_size: {video_tokenizer.codebook_size}") + +# print(f"\nLoading transformer...") +# from src.transformer_video import WanDiscreteVideoTransformer + +# # Calculate compressed dimensions for transformer +# num_frames = 8 +# height = 256 +# width = 448 +# F_prime = num_frames // video_tokenizer.t_downsample +# H_prime = height // video_tokenizer.h_downsample +# W_prime = width // video_tokenizer.w_downsample + +# # Get actual text encoder output dimension +# # UMT5-base outputs 768 dimensions, not 4096 +# text_dim_actual = text_encoder.config.d_model +# print(f" Text encoder output dimension: {text_dim_actual}") + +# transformer = WanDiscreteVideoTransformer( +# codebook_size=video_tokenizer.codebook_size, +# vocab_size=video_tokenizer.codebook_size + 1, +# num_frames=F_prime, +# height=H_prime, +# width=W_prime, +# model_type='t2v', +# patch_size=(1, 2, 2), +# text_len=512, +# in_dim=16, +# dim=2048, +# ffn_dim=8192, +# freq_dim=256, +# text_dim=text_dim_actual, # Use actual text encoder dimension (768 for UMT5-base) +# out_dim=16, +# num_heads=16, +# num_layers=32, +# window_size=(-1, -1), +# qk_norm=True, +# cross_attn_norm=True, +# eps=1e-6 +# ).to(device, dtype=dtype) +# print(f"✓ Transformer initialized") + +# print(f"\nInitializing Pipeline...") +# pipe = Pipeline( +# tokenizer=tokenizer, +# text_encoder=text_encoder, +# transformer=transformer, +# scheduler=scheduler, +# video_tokenizer=video_tokenizer, +# text_len=512, +# num_frames=num_frames, +# height=height, +# width=width, +# ).to(device) +# print(f"✓ Pipeline initialized") + +# except Exception as e: +# print(f"\n✗ Failed to build pipeline components: {e}") +# import traceback +# traceback.print_exc() +# exit(1) + +# # Step 3: Test pipeline forward pass with latent-only output +# if not test_pipeline_forward_latent_only(pipe, device): +# print("\n✗ Pipeline forward test failed. 
Exiting.") +# exit(1) + +# # Step 4: Optional - Full pipeline test with PIL output +# # (Uncomment when latent-only test passes) +# # print("\n" + "=" * 80) +# # print("[Test] Full pipeline test with PIL output") +# # print("=" * 80) +# # try: +# # result = pipe( +# # prompt="a test video", +# # num_frames=num_frames, +# # height=height, +# # width=width, +# # num_inference_steps=2, +# # output_type="pil", +# # return_dict=True, +# # ) +# # print(f"✓ Full pipeline test passed!") +# # except Exception as e: +# # print(f"✗ Full pipeline test failed: {e}") +# # import traceback +# # traceback.print_exc() + +# print("\n" + "=" * 80) +# print("All tests passed successfully!") +# print("=" * 80) \ No newline at end of file diff --git a/Meissonic/src/scheduler.py b/Meissonic/src/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..3d2fe4276351ffb0ec5883c59c4985a5ca9bd859 --- /dev/null +++ b/Meissonic/src/scheduler.py @@ -0,0 +1,175 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput +from diffusers.schedulers.scheduling_utils import SchedulerMixin + + +def gumbel_noise(t, generator=None): + device = generator.device if generator is not None else t.device + noise = torch.zeros_like(t, device=device).uniform_(0, 1, generator=generator).to(t.device) + return -torch.log((-torch.log(noise.clamp(1e-20))).clamp(1e-20)) + + +def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None): + confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator) + sorted_confidence = torch.sort(confidence, dim=-1).values + cut_off = torch.gather(sorted_confidence, 1, mask_len.long()) + masking = confidence < cut_off + return masking + + +@dataclass +class SchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. 
+ """ + + prev_sample: torch.Tensor + pred_original_sample: torch.Tensor = None + + +class Scheduler(SchedulerMixin, ConfigMixin): + order = 1 + + temperatures: torch.Tensor + + @register_to_config + def __init__( + self, + mask_token_id: int, + masking_schedule: str = "cosine", + ): + self.temperatures = None + self.timesteps = None + + def set_timesteps( + self, + num_inference_steps: int, + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + device: Union[str, torch.device] = None, + ): + self.timesteps = torch.arange(num_inference_steps, device=device).flip(0) + + if isinstance(temperature, (tuple, list)): + self.temperatures = torch.linspace(temperature[0], temperature[1], num_inference_steps, device=device) + else: + self.temperatures = torch.linspace(temperature, 0.01, num_inference_steps, device=device) + + def step( + self, + model_output: torch.Tensor, + timestep: torch.long, + sample: torch.LongTensor, + starting_mask_ratio: int = 1, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + two_dim_input = sample.ndim == 3 and model_output.ndim == 4 + + if two_dim_input: + batch_size, codebook_size, height, width = model_output.shape + sample = sample.reshape(batch_size, height * width) + model_output = model_output.reshape(batch_size, codebook_size, height * width).permute(0, 2, 1) + + unknown_map = sample == self.config.mask_token_id + + probs = model_output.softmax(dim=-1) + + device = probs.device + probs_ = probs.to(generator.device) if generator is not None else probs # handles when generator is on CPU + if probs_.device.type == "cpu" and probs_.dtype != torch.float32: + probs_ = probs_.float() # multinomial is not implemented for cpu half precision + probs_ = probs_.reshape(-1, probs.size(-1)) + pred_original_sample = torch.multinomial(probs_, 1, generator=generator).to(device=device) + pred_original_sample = pred_original_sample[:, 0].view(*probs.shape[:-1]) + pred_original_sample = torch.where(unknown_map, pred_original_sample, sample) + + if timestep == 0: + prev_sample = pred_original_sample + else: + seq_len = sample.shape[1] + step_idx = (self.timesteps == timestep).nonzero() + ratio = (step_idx + 1) / len(self.timesteps) + + if self.config.masking_schedule == "cosine": + mask_ratio = torch.cos(ratio * math.pi / 2) + elif self.config.masking_schedule == "linear": + mask_ratio = 1 - ratio + else: + raise ValueError(f"unknown masking schedule {self.config.masking_schedule}") + + mask_ratio = starting_mask_ratio * mask_ratio + + mask_len = (seq_len * mask_ratio).floor() + # do not mask more than amount previously masked + mask_len = torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len) + # mask at least one + mask_len = torch.max(torch.tensor([1], device=model_output.device), mask_len) + + selected_probs = torch.gather(probs, -1, pred_original_sample[:, :, None])[:, :, 0] + # Ignores the tokens given in the input by overwriting their confidence. + selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max) + + masking = mask_by_random_topk(mask_len, selected_probs, self.temperatures[step_idx], generator) + + # Masks tokens with lower confidence. 
+ prev_sample = torch.where(masking, self.config.mask_token_id, pred_original_sample) + + if two_dim_input: + prev_sample = prev_sample.reshape(batch_size, height, width) + pred_original_sample = pred_original_sample.reshape(batch_size, height, width) + + if not return_dict: + return (prev_sample, pred_original_sample) + + return SchedulerOutput(prev_sample, pred_original_sample) + + def add_noise(self, sample, timesteps, generator=None): + step_idx = (self.timesteps == timesteps).nonzero() + ratio = (step_idx + 1) / len(self.timesteps) + + if self.config.masking_schedule == "cosine": + mask_ratio = torch.cos(ratio * math.pi / 2) + elif self.config.masking_schedule == "linear": + mask_ratio = 1 - ratio + else: + raise ValueError(f"unknown masking schedule {self.config.masking_schedule}") + + mask_indices = ( + torch.rand( + sample.shape, device=generator.device if generator is not None else sample.device, generator=generator + ).to(sample.device) + < mask_ratio + ) + + masked_sample = sample.clone() + + masked_sample[mask_indices] = self.config.mask_token_id + + return masked_sample diff --git a/Meissonic/src/scheduler_video.py b/Meissonic/src/scheduler_video.py new file mode 100644 index 0000000000000000000000000000000000000000..a3e02c7d9fe23349b1c1deca7d85018a9467713f --- /dev/null +++ b/Meissonic/src/scheduler_video.py @@ -0,0 +1,188 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput +from diffusers.schedulers.scheduling_utils import SchedulerMixin + + +def gumbel_noise(t, generator=None): + device = generator.device if generator is not None else t.device + noise = torch.zeros_like(t, device=device).uniform_(0, 1, generator=generator).to(t.device) + return -torch.log((-torch.log(noise.clamp(1e-20))).clamp(1e-20)) + + +def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None): + confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator) + sorted_confidence = torch.sort(confidence, dim=-1).values + cut_off = torch.gather(sorted_confidence, 1, mask_len.long()) + masking = confidence < cut_off + return masking + + +@dataclass +class SchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. 
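+
+            Note: this video variant operates on grids of discrete token ids, so the samples may also have
+            shape `(batch_size, num_frames, height, width)` for video grids, `(batch_size, height, width)`
+            for images, or `(batch_size, seq_len)` when the caller passes pre-flattened tokens.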
+ """ + + prev_sample: torch.Tensor + pred_original_sample: torch.Tensor = None + + +class Scheduler(SchedulerMixin, ConfigMixin): + order = 1 + + temperatures: torch.Tensor + + @register_to_config + def __init__( + self, + mask_token_id: int, + masking_schedule: str = "cosine", + ): + self.temperatures = None + self.timesteps = None + + def set_timesteps( + self, + num_inference_steps: int, + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + device: Union[str, torch.device] = None, + ): + self.timesteps = torch.arange(num_inference_steps, device=device).flip(0) + + if isinstance(temperature, (tuple, list)): + self.temperatures = torch.linspace(temperature[0], temperature[1], num_inference_steps, device=device) + else: + self.temperatures = torch.linspace(temperature, 0.01, num_inference_steps, device=device) + + def step( + self, + model_output: torch.Tensor, + timestep: torch.long, + sample: torch.LongTensor, + starting_mask_ratio: int = 1, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + # Handle different input shapes: 1D, 2D (image), or 3D (video) + # All are flattened to 1D token sequences [B, N] for unified processing + two_dim_input = sample.ndim == 3 and model_output.ndim == 4 # [B, H, W] & [B, vocab, H, W] + three_dim_input = sample.ndim == 4 and model_output.ndim == 5 # [B, F, H, W] & [B, vocab, F, H, W] + + if two_dim_input: + # Image case: [B, H, W] -> [B, H*W] + batch_size, vocab_size, height, width = model_output.shape + sample = sample.reshape(batch_size, height * width) + model_output = model_output.reshape(batch_size, vocab_size, height * width).permute(0, 2, 1) + elif three_dim_input: + # Video case: [B, F, H, W] -> [B, F*H*W] + batch_size, vocab_size, num_frames, height, width = model_output.shape + sample = sample.reshape(batch_size, num_frames * height * width) + model_output = model_output.reshape(batch_size, vocab_size, num_frames * height * width).permute(0, 2, 1) + + unknown_map = sample == self.config.mask_token_id + + probs = model_output.softmax(dim=-1) + + device = probs.device + probs_ = probs.to(generator.device) if generator is not None else probs # handles when generator is on CPU + if probs_.device.type == "cpu" and probs_.dtype != torch.float32: + probs_ = probs_.float() # multinomial is not implemented for cpu half precision + probs_ = probs_.reshape(-1, probs.size(-1)) + pred_original_sample = torch.multinomial(probs_, 1, generator=generator).to(device=device) + pred_original_sample = pred_original_sample[:, 0].view(*probs.shape[:-1]) + pred_original_sample = torch.where(unknown_map, pred_original_sample, sample) + + if timestep == 0: + prev_sample = pred_original_sample + else: + seq_len = sample.shape[1] + step_idx = (self.timesteps == timestep).nonzero() + ratio = (step_idx + 1) / len(self.timesteps) + + if self.config.masking_schedule == "cosine": + mask_ratio = torch.cos(ratio * math.pi / 2) + elif self.config.masking_schedule == "linear": + mask_ratio = 1 - ratio + else: + raise ValueError(f"unknown masking schedule {self.config.masking_schedule}") + + mask_ratio = starting_mask_ratio * mask_ratio + + mask_len = (seq_len * mask_ratio).floor() + # do not mask more than amount previously masked + mask_len = torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len) + # mask at least one + mask_len = torch.max(torch.tensor([1], device=model_output.device), mask_len) + + selected_probs = torch.gather(probs, -1, pred_original_sample[:, :, None])[:, :, 0] + # Ignores 
the tokens given in the input by overwriting their confidence. + selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max) + + masking = mask_by_random_topk(mask_len, selected_probs, self.temperatures[step_idx], generator) + + # Masks tokens with lower confidence. + prev_sample = torch.where(masking, self.config.mask_token_id, pred_original_sample) + + # Reshape back to original dimensions + if two_dim_input: + prev_sample = prev_sample.reshape(batch_size, height, width) + pred_original_sample = pred_original_sample.reshape(batch_size, height, width) + elif three_dim_input: + prev_sample = prev_sample.reshape(batch_size, num_frames, height, width) + pred_original_sample = pred_original_sample.reshape(batch_size, num_frames, height, width) + + if not return_dict: + return (prev_sample, pred_original_sample) + + return SchedulerOutput(prev_sample, pred_original_sample) + + def add_noise(self, sample, timesteps, generator=None): + step_idx = (self.timesteps == timesteps).nonzero() + ratio = (step_idx + 1) / len(self.timesteps) + + if self.config.masking_schedule == "cosine": + mask_ratio = torch.cos(ratio * math.pi / 2) + elif self.config.masking_schedule == "linear": + mask_ratio = 1 - ratio + else: + raise ValueError(f"unknown masking schedule {self.config.masking_schedule}") + + mask_indices = ( + torch.rand( + sample.shape, device=generator.device if generator is not None else sample.device, generator=generator + ).to(sample.device) + < mask_ratio + ) + + masked_sample = sample.clone() + + masked_sample[mask_indices] = self.config.mask_token_id + + return masked_sample diff --git a/Meissonic/src/transformer.py b/Meissonic/src/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..fbce4e73ae35ea5f3e21084456bef097b3f76c83 --- /dev/null +++ b/Meissonic/src/transformer.py @@ -0,0 +1,1116 @@ +# Copyright 2024 Black Forest Labs, The HuggingFace Team, The InstantX Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
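Before the transformer definition, a minimal sketch of how the masked-token scheduler above is typically driven in a sampling loop. Everything below that is not in this diff is an assumption: the import path, the stand-in `model`, the toy sizes, and the `mask_token_id` value.

import torch

from src.scheduler_video import Scheduler   # import path assumed; file added as Meissonic/src/scheduler_video.py above

# Illustrative sizes and ids -- not taken from this diff.
mask_token_id = 8255                         # assumed: reserved mask token id (vocab of 8256 ids)
B, F, H, W = 1, 4, 16, 16                    # assumed toy batch / frame / latent-grid sizes

def model(tokens):
    # stand-in for the real transformer: returns random logits of shape [B, vocab, F, H, W]
    return torch.randn(tokens.shape[0], 8256, *tokens.shape[1:])

scheduler = Scheduler(mask_token_id=mask_token_id, masking_schedule="cosine")
scheduler.set_timesteps(num_inference_steps=12, temperature=(2, 0), device="cpu")

# Start from a fully masked token grid; step() accepts [B, H, W] (image) or [B, F, H, W] (video).
sample = torch.full((B, F, H, W), mask_token_id, dtype=torch.long)

for t in scheduler.timesteps:
    logits = scheduler_input = model(sample)
    out = scheduler.step(logits, t, sample)
    sample = out.prev_sample                 # low-confidence tokens are re-masked until t == 0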
+ + +from typing import Any, Dict, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin +from diffusers.models.attention import FeedForward, BasicTransformerBlock, SkipFFTransformerBlock +from diffusers.models.attention_processor import ( + Attention, + AttentionProcessor, + FluxAttnProcessor2_0, + # FusedFluxAttnProcessor2_0, +) +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle, GlobalResponseNorm, RMSNorm +from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers +from diffusers.utils.torch_utils import maybe_allow_in_graph +from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings,TimestepEmbedding, get_timestep_embedding #,FluxPosEmbed +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.models.resnet import Downsample2D, Upsample2D + +from typing import List + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + + +def get_3d_rotary_pos_embed( + embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + RoPE for video tokens with 3D structure. + + Args: + embed_dim: (`int`): + The embedding dimension size, corresponding to hidden_size_head. + crops_coords (`Tuple[int]`): + The top-left and bottom-right coordinates of the crop. + grid_size (`Tuple[int]`): + The grid size of the spatial positional embedding (height, width). + temporal_size (`int`): + The size of the temporal dimension. + theta (`float`): + Scaling factor for frequency computation. + use_real (`bool`): + If True, return real part and imaginary part separately. Otherwise, return complex numbers. + + Returns: + `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`. 
+ """ + start, stop = crops_coords + grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32) + grid_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32) + grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32) + + # Compute dimensions for each axis + dim_t = embed_dim // 4 + dim_h = embed_dim // 8 * 3 + dim_w = embed_dim // 8 * 3 + + # Temporal frequencies + freqs_t = 1.0 / (theta ** (torch.arange(0, dim_t, 2).float() / dim_t)) + grid_t = torch.from_numpy(grid_t).float() + freqs_t = torch.einsum("n , f -> n f", grid_t, freqs_t) + freqs_t = freqs_t.repeat_interleave(2, dim=-1) + + # Spatial frequencies for height and width + freqs_h = 1.0 / (theta ** (torch.arange(0, dim_h, 2).float() / dim_h)) + freqs_w = 1.0 / (theta ** (torch.arange(0, dim_w, 2).float() / dim_w)) + grid_h = torch.from_numpy(grid_h).float() + grid_w = torch.from_numpy(grid_w).float() + freqs_h = torch.einsum("n , f -> n f", grid_h, freqs_h) + freqs_w = torch.einsum("n , f -> n f", grid_w, freqs_w) + freqs_h = freqs_h.repeat_interleave(2, dim=-1) + freqs_w = freqs_w.repeat_interleave(2, dim=-1) + + # Broadcast and concatenate tensors along specified dimension + def broadcast(tensors, dim=-1): + num_tensors = len(tensors) + shape_lens = {len(t.shape) for t in tensors} + assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" + shape_len = list(shape_lens)[0] + dim = (dim + shape_len) if dim < 0 else dim + dims = list(zip(*(list(t.shape) for t in tensors))) + expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] + assert all( + [*(len(set(t[1])) <= 2 for t in expandable_dims)] + ), "invalid dimensions for broadcastable concatenation" + max_dims = [(t[0], max(t[1])) for t in expandable_dims] + expanded_dims = [(t[0], (t[1],) * num_tensors) for t in max_dims] + expanded_dims.insert(dim, (dim, dims[dim])) + expandable_shapes = list(zip(*(t[1] for t in expanded_dims))) + tensors = [t[0].expand(*t[1]) for t in zip(tensors, expandable_shapes)] + return torch.cat(tensors, dim=dim) + + freqs = broadcast((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1) + + t, h, w, d = freqs.shape + freqs = freqs.view(t * h * w, d) + + # Generate sine and cosine components + sin = freqs.sin() + cos = freqs.cos() + + if use_real: + return cos, sin + else: + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) + return freqs_cis + + +def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real=True): + """ + RoPE for image tokens with 2d structure. + + Args: + embed_dim: (`int`): + The embedding dimension size + crops_coords (`Tuple[int]`) + The top-left and bottom-right coordinates of the crop. + grid_size (`Tuple[int]`): + The grid size of the positional embedding. + use_real (`bool`): + If True, return real part and imaginary part separately. Otherwise, return complex numbers. + + Returns: + `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`. 
+ """ + start, stop = crops_coords + grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32) + grid_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) # [2, W, H] + + grid = grid.reshape([2, 1, *grid.shape[1:]]) + pos_embed = get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=use_real) + return pos_embed + + +def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False): + assert embed_dim % 4 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_rotary_pos_embed( + embed_dim // 2, grid[0].reshape(-1), use_real=use_real + ) # (H*W, D/2) if use_real else (H*W, D/4) + emb_w = get_1d_rotary_pos_embed( + embed_dim // 2, grid[1].reshape(-1), use_real=use_real + ) # (H*W, D/2) if use_real else (H*W, D/4) + + if use_real: + cos = torch.cat([emb_h[0], emb_w[0]], dim=1) # (H*W, D) + sin = torch.cat([emb_h[1], emb_w[1]], dim=1) # (H*W, D) + return cos, sin + else: + emb = torch.cat([emb_h, emb_w], dim=1) # (H*W, D/2) + return emb + + +def get_2d_rotary_pos_embed_lumina(embed_dim, len_h, len_w, linear_factor=1.0, ntk_factor=1.0): + assert embed_dim % 4 == 0 + + emb_h = get_1d_rotary_pos_embed( + embed_dim // 2, len_h, linear_factor=linear_factor, ntk_factor=ntk_factor + ) # (H, D/4) + emb_w = get_1d_rotary_pos_embed( + embed_dim // 2, len_w, linear_factor=linear_factor, ntk_factor=ntk_factor + ) # (W, D/4) + emb_h = emb_h.view(len_h, 1, embed_dim // 4, 1).repeat(1, len_w, 1, 1) # (H, W, D/4, 1) + emb_w = emb_w.view(1, len_w, embed_dim // 4, 1).repeat(len_h, 1, 1, 1) # (H, W, D/4, 1) + + emb = torch.cat([emb_h, emb_w], dim=-1).flatten(2) # (H, W, D/2) + return emb + + +def get_1d_rotary_pos_embed( + dim: int, + pos: Union[np.ndarray, int], + theta: float = 10000.0, + use_real=False, + linear_factor=1.0, + ntk_factor=1.0, + repeat_interleave_real=True, + freqs_dtype=torch.float32, # torch.float32 (hunyuan, stable audio), torch.float64 (flux) +): + """ + Precompute the frequency tensor for complex exponentials (cis) with given dimensions. + + This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end + index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64 + data type. + + Args: + dim (`int`): Dimension of the frequency tensor. + pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar + theta (`float`, *optional*, defaults to 10000.0): + Scaling factor for frequency computation. Defaults to 10000.0. + use_real (`bool`, *optional*): + If True, return real part and imaginary part separately. Otherwise, return complex numbers. + linear_factor (`float`, *optional*, defaults to 1.0): + Scaling factor for the context extrapolation. Defaults to 1.0. + ntk_factor (`float`, *optional*, defaults to 1.0): + Scaling factor for the NTK-Aware RoPE. Defaults to 1.0. + repeat_interleave_real (`bool`, *optional*, defaults to `True`): + If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`. + Otherwise, they are concateanted with themselves. + freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`): + the dtype of the frequency tensor. + Returns: + `torch.Tensor`: Precomputed frequency tensor with complex exponentials. 
[S, D/2] + """ + assert dim % 2 == 0 + + if isinstance(pos, int): + pos = np.arange(pos) + theta = theta * ntk_factor + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / dim)) / linear_factor # [D/2] + t = torch.from_numpy(pos).to(freqs.device) # type: ignore # [S] + freqs = torch.outer(t, freqs) # type: ignore # [S, D/2] + if use_real and repeat_interleave_real: + freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float() # [S, D] + freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float() # [S, D] + return freqs_cos, freqs_sin + elif use_real: + freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float() # [S, D] + freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float() # [S, D] + return freqs_cos, freqs_sin + else: + freqs_cis = torch.polar(torch.ones_like(freqs), freqs).float() # complex64 # [S, D/2] + return freqs_cis + + +class FluxPosEmbed(nn.Module): + # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11 + def __init__(self, theta: int, axes_dim: List[int]): + super().__init__() + self.theta = theta + self.axes_dim = axes_dim + + def forward(self, ids: torch.Tensor) -> torch.Tensor: + n_axes = ids.shape[-1] + cos_out = [] + sin_out = [] + pos = ids.squeeze().float().cpu().numpy() + is_mps = ids.device.type == "mps" + freqs_dtype = torch.float32 if is_mps else torch.float64 + for i in range(n_axes): + cos, sin = get_1d_rotary_pos_embed( + self.axes_dim[i], pos[:, i], repeat_interleave_real=True, use_real=True, freqs_dtype=freqs_dtype + ) + cos_out.append(cos) + sin_out.append(sin) + freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device) + freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device) + return freqs_cos, freqs_sin + + + +class FusedFluxAttnProcessor2_0: + """Attention processor used typically in processing the SD3-like self-attention projections.""" + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "FusedFluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." + ) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + + # `sample` projections. + qkv = attn.to_qkv(hidden_states) + split_size = qkv.shape[-1] // 3 + query, key, value = torch.split(qkv, split_size, dim=-1) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states` + # `context` projections. 
+ if encoder_hidden_states is not None: + encoder_qkv = attn.to_added_qkv(encoder_hidden_states) + split_size = encoder_qkv.shape[-1] // 3 + ( + encoder_hidden_states_query_proj, + encoder_hidden_states_key_proj, + encoder_hidden_states_value_proj, + ) = torch.split(encoder_qkv, split_size, dim=-1) + + encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + + if attn.norm_added_q is not None: + encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj) + if attn.norm_added_k is not None: + encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj) + + # attention + query = torch.cat([encoder_hidden_states_query_proj, query], dim=2) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=2) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=2) + + if image_rotary_emb is not None: + from diffusers.models.embeddings import apply_rotary_emb + + query = apply_rotary_emb(query, image_rotary_emb) + key = apply_rotary_emb(key, image_rotary_emb) + + hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False) + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + if encoder_hidden_states is not None: + encoder_hidden_states, hidden_states = ( + hidden_states[:, : encoder_hidden_states.shape[1]], + hidden_states[:, encoder_hidden_states.shape[1] :], + ) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + encoder_hidden_states = attn.to_add_out(encoder_hidden_states) + + return hidden_states, encoder_hidden_states + else: + return hidden_states + + + +@maybe_allow_in_graph +class SingleTransformerBlock(nn.Module): + r""" + A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3. + + Reference: https://arxiv.org/abs/2403.03206 + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the + processing of `context` conditions. 
+ """ + + def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0): + super().__init__() + self.mlp_hidden_dim = int(dim * mlp_ratio) + + self.norm = AdaLayerNormZeroSingle(dim) + self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim) + self.act_mlp = nn.GELU(approximate="tanh") + self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim) + + processor = FluxAttnProcessor2_0() + self.attn = Attention( + query_dim=dim, + cross_attention_dim=None, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + bias=True, + processor=processor, + qk_norm="rms_norm", + eps=1e-6, + pre_only=True, + ) + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: torch.FloatTensor, + image_rotary_emb=None, + ): + residual = hidden_states + norm_hidden_states, gate = self.norm(hidden_states, emb=temb) + mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states)) + + attn_output = self.attn( + hidden_states=norm_hidden_states, + image_rotary_emb=image_rotary_emb, + ) + + hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2) + gate = gate.unsqueeze(1) + hidden_states = gate * self.proj_out(hidden_states) + hidden_states = residual + hidden_states + if hidden_states.dtype == torch.float16: + hidden_states = hidden_states.clip(-65504, 65504) + + return hidden_states + +@maybe_allow_in_graph +class TransformerBlock(nn.Module): + r""" + A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3. + + Reference: https://arxiv.org/abs/2403.03206 + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the + processing of `context` conditions. + """ + + def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6): + super().__init__() + + self.norm1 = AdaLayerNormZero(dim) + + self.norm1_context = AdaLayerNormZero(dim) + + if hasattr(F, "scaled_dot_product_attention"): + processor = FluxAttnProcessor2_0() + else: + raise ValueError( + "The current PyTorch version does not support the `scaled_dot_product_attention` function." 
+ ) + self.attn = Attention( + query_dim=dim, + cross_attention_dim=None, + added_kv_proj_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + context_pre_only=False, + bias=True, + processor=processor, + qk_norm=qk_norm, + eps=eps, + ) + + self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6) + self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + # self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="swiglu") + + self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6) + self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + # self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="swiglu") + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + temb: torch.FloatTensor, + image_rotary_emb=None, + ): + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb) + + norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context( + encoder_hidden_states, emb=temb + ) + # Attention. + attn_output, context_attn_output = self.attn( + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + image_rotary_emb=image_rotary_emb, + ) + + # Process attention outputs for the `hidden_states`. + attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = hidden_states + attn_output + + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + ff_output = self.ff(norm_hidden_states) + ff_output = gate_mlp.unsqueeze(1) * ff_output + + hidden_states = hidden_states + ff_output + + # Process attention outputs for the `encoder_hidden_states`. 
+ + context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output + encoder_hidden_states = encoder_hidden_states + context_attn_output + + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) + norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None] + + context_ff_output = self.ff_context(norm_encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output + if encoder_hidden_states.dtype == torch.float16: + encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504) + + return encoder_hidden_states, hidden_states + + +class UVit2DConvEmbed(nn.Module): + def __init__(self, in_channels, block_out_channels, vocab_size, elementwise_affine, eps, bias): + super().__init__() + self.embeddings = nn.Embedding(vocab_size, in_channels) + self.layer_norm = RMSNorm(in_channels, eps, elementwise_affine) + self.conv = nn.Conv2d(in_channels, block_out_channels, kernel_size=1, bias=bias) + + def forward(self, input_ids): + embeddings = self.embeddings(input_ids) + embeddings = self.layer_norm(embeddings) + embeddings = embeddings.permute(0, 3, 1, 2) + embeddings = self.conv(embeddings) + return embeddings + +class ConvMlmLayer(nn.Module): + def __init__( + self, + block_out_channels: int, + in_channels: int, + use_bias: bool, + ln_elementwise_affine: bool, + layer_norm_eps: float, + codebook_size: int, + ): + super().__init__() + self.conv1 = nn.Conv2d(block_out_channels, in_channels, kernel_size=1, bias=use_bias) + self.layer_norm = RMSNorm(in_channels, layer_norm_eps, ln_elementwise_affine) + self.conv2 = nn.Conv2d(in_channels, codebook_size, kernel_size=1, bias=use_bias) + + def forward(self, hidden_states): + hidden_states = self.conv1(hidden_states) + hidden_states = self.layer_norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + logits = self.conv2(hidden_states) + return logits + +class SwiGLU(nn.Module): + r""" + A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU` + but uses SiLU / Swish instead of GeLU. + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. 
+ """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias) + self.activation = nn.SiLU() + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + hidden_states, gate = hidden_states.chunk(2, dim=-1) + return hidden_states * self.activation(gate) + +class ConvNextBlock(nn.Module): + def __init__( + self, channels, layer_norm_eps, ln_elementwise_affine, use_bias, hidden_dropout, hidden_size, res_ffn_factor=4 + ): + super().__init__() + self.depthwise = nn.Conv2d( + channels, + channels, + kernel_size=3, + padding=1, + groups=channels, + bias=use_bias, + ) + self.norm = RMSNorm(channels, layer_norm_eps, ln_elementwise_affine) + self.channelwise_linear_1 = nn.Linear(channels, int(channels * res_ffn_factor), bias=use_bias) + self.channelwise_act = nn.GELU() + self.channelwise_norm = GlobalResponseNorm(int(channels * res_ffn_factor)) + self.channelwise_linear_2 = nn.Linear(int(channels * res_ffn_factor), channels, bias=use_bias) + self.channelwise_dropout = nn.Dropout(hidden_dropout) + self.cond_embeds_mapper = nn.Linear(hidden_size, channels * 2, use_bias) + + def forward(self, x, cond_embeds): + x_res = x + + x = self.depthwise(x) + + x = x.permute(0, 2, 3, 1) + x = self.norm(x) + + x = self.channelwise_linear_1(x) + x = self.channelwise_act(x) + x = self.channelwise_norm(x) + x = self.channelwise_linear_2(x) + x = self.channelwise_dropout(x) + + x = x.permute(0, 3, 1, 2) + + x = x + x_res + + scale, shift = self.cond_embeds_mapper(F.silu(cond_embeds)).chunk(2, dim=1) + x = x * (1 + scale[:, :, None, None]) + shift[:, :, None, None] + + return x + +class Simple_UVitBlock(nn.Module): + def __init__( + self, + channels, + ln_elementwise_affine, + layer_norm_eps, + use_bias, + downsample: bool, + upsample: bool, + ): + super().__init__() + + if downsample: + self.downsample = Downsample2D( + channels, + use_conv=True, + padding=0, + name="Conv2d_0", + kernel_size=2, + norm_type="rms_norm", + eps=layer_norm_eps, + elementwise_affine=ln_elementwise_affine, + bias=use_bias, + ) + else: + self.downsample = None + + if upsample: + self.upsample = Upsample2D( + channels, + use_conv_transpose=True, + kernel_size=2, + padding=0, + name="conv", + norm_type="rms_norm", + eps=layer_norm_eps, + elementwise_affine=ln_elementwise_affine, + bias=use_bias, + interpolate=False, + ) + else: + self.upsample = None + + def forward(self, x): + # print("before,", x.shape) + if self.downsample is not None: + # print('downsample') + x = self.downsample(x) + + if self.upsample is not None: + # print('upsample') + x = self.upsample(x) + # print("after,", x.shape) + return x + +class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): + """ + The Transformer model introduced in Flux. + + Reference: https://blackforestlabs.ai/announcing-black-forest-labs/ + + Parameters: + patch_size (`int`): Patch size to turn the input data into small patches. + in_channels (`int`, *optional*, defaults to 16): The number of channels in the input. + num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use. + num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use. + attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head. + num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention. 
+ joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. + pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`. + guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings. + """ + + _supports_gradient_checkpointing = False #True + # Due to NotImplementedError: DDPOptimizer backend: Found a higher order op in the graph. This is not supported. Please turn off DDP optimizer using torch._dynamo.config.optimize_ddp=False. Note that this can cause performance degradation because there will be one bucket for the entire Dynamo graph. + # Please refer to this issue - https://github.com/pytorch/pytorch/issues/104674. + _no_split_modules = ["TransformerBlock", "SingleTransformerBlock"] + + @register_to_config + def __init__( + self, + patch_size: int = 1, + in_channels: int = 64, + num_layers: int = 19, + num_single_layers: int = 38, + attention_head_dim: int = 128, + num_attention_heads: int = 24, + joint_attention_dim: int = 4096, + pooled_projection_dim: int = 768, + guidance_embeds: bool = False, # unused in our implementation + axes_dims_rope: Tuple[int] = (16, 56, 56), + vocab_size: int = 8256, + codebook_size: int = 8192, + downsample: bool = False, + upsample: bool = False, + ): + super().__init__() + self.out_channels = in_channels + self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim + + self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope) + text_time_guidance_cls = ( + CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings + ) + self.time_text_embed = text_time_guidance_cls( + embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim + ) + + self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim) + + self.transformer_blocks = nn.ModuleList( + [ + TransformerBlock( + dim=self.inner_dim, + num_attention_heads=self.config.num_attention_heads, + attention_head_dim=self.config.attention_head_dim, + ) + for i in range(self.config.num_layers) + ] + ) + + self.single_transformer_blocks = nn.ModuleList( + [ + SingleTransformerBlock( + dim=self.inner_dim, + num_attention_heads=self.config.num_attention_heads, + attention_head_dim=self.config.attention_head_dim, + ) + for i in range(self.config.num_single_layers) + ] + ) + + + self.gradient_checkpointing = False + + in_channels_embed = self.inner_dim + ln_elementwise_affine = True + layer_norm_eps = 1e-06 + use_bias = False + micro_cond_embed_dim = 1280 + self.embed = UVit2DConvEmbed( + in_channels_embed, self.inner_dim, self.config.vocab_size, ln_elementwise_affine, layer_norm_eps, use_bias + ) + self.mlm_layer = ConvMlmLayer( + self.inner_dim, in_channels_embed, use_bias, ln_elementwise_affine, layer_norm_eps, self.config.codebook_size + ) + self.cond_embed = TimestepEmbedding( + micro_cond_embed_dim + self.config.pooled_projection_dim, self.inner_dim, sample_proj_bias=use_bias + ) + self.encoder_proj_layer_norm = RMSNorm(self.inner_dim, layer_norm_eps, ln_elementwise_affine) + self.project_to_hidden_norm = RMSNorm(in_channels_embed, layer_norm_eps, ln_elementwise_affine) + self.project_to_hidden = nn.Linear(in_channels_embed, self.inner_dim, bias=use_bias) + self.project_from_hidden_norm = RMSNorm(self.inner_dim, layer_norm_eps, ln_elementwise_affine) + self.project_from_hidden = nn.Linear(self.inner_dim, in_channels_embed, bias=use_bias) + + self.down_block = Simple_UVitBlock( + 
self.inner_dim, + ln_elementwise_affine, + layer_norm_eps, + use_bias, + downsample, + False, + ) + self.up_block = Simple_UVitBlock( + self.inner_dim, #block_out_channels, + ln_elementwise_affine, + layer_norm_eps, + use_bias, + False, + upsample=upsample, + ) + + # self.fuse_qkv_projections() + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0 + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. 
+ + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + self.set_attn_processor(FusedFluxAttnProcessor2_0()) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor = None, + pooled_projections: torch.Tensor = None, + timestep: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_block_samples= None, + controlnet_single_block_samples=None, + return_dict: bool = True, + micro_conds: torch.Tensor = None, + ) -> Union[torch.FloatTensor, Transformer2DModelOutput]: + """ + The [`FluxTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected + from the embeddings of input conditions. + timestep ( `torch.LongTensor`): + Used to indicate denoising step. + block_controlnet_hidden_states: (`list` of `torch.Tensor`): + A list of tensors that if specified are added to the residuals of transformer blocks. + joint_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. 
+ """ + micro_cond_encode_dim = 256 # same as self.config.micro_cond_encode_dim = 256 from amused + micro_cond_embeds = get_timestep_embedding( + micro_conds.flatten(), micro_cond_encode_dim, flip_sin_to_cos=True, downscale_freq_shift=0 + ) + micro_cond_embeds = micro_cond_embeds.reshape((hidden_states.shape[0], -1)) + + pooled_projections = torch.cat([pooled_projections, micro_cond_embeds], dim=1) + pooled_projections = pooled_projections.to(dtype=self.dtype) + pooled_projections = self.cond_embed(pooled_projections).to(encoder_hidden_states.dtype) + + + hidden_states = self.embed(hidden_states) + + encoder_hidden_states = self.context_embedder(encoder_hidden_states) + encoder_hidden_states = self.encoder_proj_layer_norm(encoder_hidden_states) + hidden_states = self.down_block(hidden_states) + + batch_size, channels, height, width = hidden_states.shape + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels) + hidden_states = self.project_to_hidden_norm(hidden_states) + hidden_states = self.project_to_hidden(hidden_states) + + + if joint_attention_kwargs is not None: + joint_attention_kwargs = joint_attention_kwargs.copy() + lora_scale = joint_attention_kwargs.pop("scale", 1.0) + else: + lora_scale = 1.0 + + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + else: + if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None: + logger.warning( + "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective." + ) + + timestep = timestep.to(hidden_states.dtype) * 1000 + if guidance is not None: + guidance = guidance.to(hidden_states.dtype) * 1000 + else: + guidance = None + temb = ( + self.time_text_embed(timestep, pooled_projections) + if guidance is None + else self.time_text_embed(timestep, guidance, pooled_projections) + ) + + if txt_ids.ndim == 3: + logger.warning( + "Passing `txt_ids` 3d torch.Tensor is deprecated." + "Please remove the batch dimension and pass it as a 2d torch Tensor" + ) + txt_ids = txt_ids[0] + if img_ids.ndim == 3: + logger.warning( + "Passing `img_ids` 3d torch.Tensor is deprecated." 
+ "Please remove the batch dimension and pass it as a 2d torch Tensor" + ) + img_ids = img_ids[0] + ids = torch.cat((txt_ids, img_ids), dim=0) + + image_rotary_emb = self.pos_embed(ids) + + for index_block, block in enumerate(self.transformer_blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + temb, + image_rotary_emb, + **ckpt_kwargs, + ) + + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) + + + # controlnet residual + if controlnet_block_samples is not None: + interval_control = len(self.transformer_blocks) / len(controlnet_block_samples) + interval_control = int(np.ceil(interval_control)) + hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control] + + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + for index_block, block in enumerate(self.single_transformer_blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + temb, + image_rotary_emb, + **ckpt_kwargs, + ) + + else: + hidden_states = block( + hidden_states=hidden_states, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) + + # controlnet residual + if controlnet_single_block_samples is not None: + interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples) + interval_control = int(np.ceil(interval_control)) + hidden_states[:, encoder_hidden_states.shape[1] :, ...] = ( + hidden_states[:, encoder_hidden_states.shape[1] :, ...] + + controlnet_single_block_samples[index_block // interval_control] + ) + + hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...] 
+ + + hidden_states = self.project_from_hidden_norm(hidden_states) + hidden_states = self.project_from_hidden(hidden_states) + + + hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2) + + hidden_states = self.up_block(hidden_states) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + output = self.mlm_layer(hidden_states) + # self.unfuse_qkv_projections() + if not return_dict: + return (output,) + + + return output \ No newline at end of file diff --git a/Meissonic/src/transformer_video.py b/Meissonic/src/transformer_video.py new file mode 100644 index 0000000000000000000000000000000000000000..ff827875e0c6e0d0066701c75160419601667041 --- /dev/null +++ b/Meissonic/src/transformer_video.py @@ -0,0 +1,1204 @@ + +import math +from typing import Optional + +import torch +import torch.nn as nn +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.modeling_utils import ModelMixin + +# Global debug flag - set to False to disable debug prints +DEBUG_TRANSFORMER = False + +# from .attention import flash_attention +import torch + +try: + import flash_attn_interface + FLASH_ATTN_3_AVAILABLE = True +except ModuleNotFoundError: + FLASH_ATTN_3_AVAILABLE = False + +try: + import flash_attn + FLASH_ATTN_2_AVAILABLE = True +except ModuleNotFoundError: + FLASH_ATTN_2_AVAILABLE = False + +import warnings + +__all__ = [ + 'flash_attention', + 'attention', +] + + +def flash_attention( + q, + k, + v, + q_lens=None, + k_lens=None, + dropout_p=0., + softmax_scale=None, + q_scale=None, + causal=False, + window_size=(-1, -1), + deterministic=False, + dtype=torch.bfloat16, + version=None, +): + """ + q: [B, Lq, Nq, C1]. + k: [B, Lk, Nk, C1]. + v: [B, Lk, Nk, C2]. Nq must be divisible by Nk. + q_lens: [B]. + k_lens: [B]. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + causal: bool. Whether to apply causal attention mask. + window_size: (left right). If not (-1, -1), apply sliding window local attention. + deterministic: bool. If True, slightly slower and uses more memory. + dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16. + """ + half_dtypes = (torch.float16, torch.bfloat16) + assert dtype in half_dtypes + assert q.device.type == 'cuda' and q.size(-1) <= 256 + + # params + b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype + + def half(x): + return x if x.dtype in half_dtypes else x.to(dtype) + + # preprocess query + if q_lens is None: + q = half(q.flatten(0, 1)) + q_lens = torch.tensor( + [lq] * b, dtype=torch.int32).to( + device=q.device, non_blocking=True) + else: + q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)])) + + # preprocess key, value + if k_lens is None: + k = half(k.flatten(0, 1)) + v = half(v.flatten(0, 1)) + k_lens = torch.tensor( + [lk] * b, dtype=torch.int32).to( + device=k.device, non_blocking=True) + else: + k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)])) + v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)])) + + q = q.to(v.dtype) + k = k.to(v.dtype) + + if q_scale is not None: + q = q * q_scale + + if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE: + warnings.warn( + 'Flash attention 3 is not available, use flash attention 2 instead.' + ) + + # apply attention + if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE: + # Note: dropout_p, window_size are not supported in FA3 now. 
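# The FA3 varlen kernel also returns auxiliary softmax statistics, so the
# [0] below keeps only the attention output before it is unflattened back
# to [B, Lq, num_heads, head_dim].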
+ x = flash_attn_interface.flash_attn_varlen_func( + q=q, + k=k, + v=v, + cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum( + 0, dtype=torch.int32).to(q.device, non_blocking=True), + cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum( + 0, dtype=torch.int32).to(q.device, non_blocking=True), + seqused_q=None, + seqused_k=None, + max_seqlen_q=lq, + max_seqlen_k=lk, + softmax_scale=softmax_scale, + causal=causal, + deterministic=deterministic)[0].unflatten(0, (b, lq)) + else: + assert FLASH_ATTN_2_AVAILABLE + x = flash_attn.flash_attn_varlen_func( + q=q, + k=k, + v=v, + cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum( + 0, dtype=torch.int32).to(q.device, non_blocking=True), + cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum( + 0, dtype=torch.int32).to(q.device, non_blocking=True), + max_seqlen_q=lq, + max_seqlen_k=lk, + dropout_p=dropout_p, + softmax_scale=softmax_scale, + causal=causal, + window_size=window_size, + deterministic=deterministic).unflatten(0, (b, lq)) + + # output + return x.type(out_dtype) + + +def attention( + q, + k, + v, + q_lens=None, + k_lens=None, + dropout_p=0., + softmax_scale=None, + q_scale=None, + causal=False, + window_size=(-1, -1), + deterministic=False, + dtype=torch.bfloat16, + fa_version=None, +): + if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE: + return flash_attention( + q=q, + k=k, + v=v, + q_lens=q_lens, + k_lens=k_lens, + dropout_p=dropout_p, + softmax_scale=softmax_scale, + q_scale=q_scale, + causal=causal, + window_size=window_size, + deterministic=deterministic, + dtype=dtype, + version=fa_version, + ) + else: + if q_lens is not None or k_lens is not None: + warnings.warn( + 'Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance.' 
+ ) + attn_mask = None + + q = q.transpose(1, 2).to(dtype) + k = k.transpose(1, 2).to(dtype) + v = v.transpose(1, 2).to(dtype) + + out = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p) + + out = out.transpose(1, 2).contiguous() + return out + + +__all__ = ['WanModel'] + + +def sinusoidal_embedding_1d(dim, position): + # preprocess + assert dim % 2 == 0 + half = dim // 2 + # Ensure position is on CPU for float64 computation to avoid CUDA issues + # Convert to float64 for precision, then move back to original device + device = position.device + position = position.to(torch.float64) + + # calculation + # Create range tensor on same device as position + arange_tensor = torch.arange(half, dtype=torch.float64, device=device) + sinusoid = torch.outer( + position, torch.pow(10000, -arange_tensor.div(half))) + x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) + return x + + +@torch.amp.autocast('cuda', enabled=False) +def rope_params(max_seq_len, dim, theta=10000): + assert dim % 2 == 0 + freqs = torch.outer( + torch.arange(max_seq_len), + 1.0 / torch.pow(theta, + torch.arange(0, dim, 2).to(torch.float64).div(dim))) + freqs = torch.polar(torch.ones_like(freqs), freqs) + return freqs + + +@torch.amp.autocast('cuda', enabled=False) +def rope_apply(x, grid_sizes, freqs): + n, c = x.size(2), x.size(3) // 2 + # Save original dtype to restore it later + original_dtype = x.dtype + + # split freqs + freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1) + + # loop over samples + output = [] + for i, (f, h, w) in enumerate(grid_sizes.tolist()): + seq_len = f * h * w + + # precompute multipliers + x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape( + seq_len, n, -1, 2)) + freqs_i = torch.cat([ + freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1), + freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1), + freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1) + ], + dim=-1).reshape(seq_len, 1, -1) + + # apply rotary embedding + x_i = torch.view_as_real(x_i * freqs_i).flatten(2) + # Convert back to original dtype before concatenating + x_i = x_i.to(dtype=original_dtype) + # Handle the remaining part of the sequence + x_remaining = x[i, seq_len:] + if x_remaining.numel() > 0: + x_i = torch.cat([x_i, x_remaining]) + else: + x_i = x_i + + # append to collection + output.append(x_i) + # Stack and ensure dtype matches original input + return torch.stack(output).to(dtype=original_dtype) + + +class WanRMSNorm(nn.Module): + + def __init__(self, dim, eps=1e-5): + super().__init__() + self.dim = dim + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + r""" + Args: + x(Tensor): Shape [B, L, C] + """ + # Ensure weight dtype matches input dtype + return self._norm(x.float()).type_as(x) * self.weight.type_as(x) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps) + + +class WanLayerNorm(nn.LayerNorm): + + def __init__(self, dim, eps=1e-6, elementwise_affine=False): + super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps) + + def forward(self, x): + r""" + Args: + x(Tensor): Shape [B, L, C] + """ + # Convert to float32 for numerical stability, ensuring weights match input dtype + original_dtype = x.dtype + x_float = x.float() + if self.elementwise_affine: + weight_float = self.weight.float() if self.weight is not None else None + bias_float = self.bias.float() if self.bias is not None else None + # Use 
torch.nn.functional.layer_norm directly with converted weights + result = torch.nn.functional.layer_norm(x_float, self.normalized_shape, weight_float, bias_float, self.eps) + else: + result = super().forward(x_float) + return result.to(dtype=original_dtype) + + +class WanSelfAttention(nn.Module): + + def __init__(self, + dim, + num_heads, + window_size=(-1, -1), + qk_norm=True, + eps=1e-6): + assert dim % num_heads == 0 + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.window_size = window_size + self.qk_norm = qk_norm + self.eps = eps + + # layers + self.q = nn.Linear(dim, dim) + self.k = nn.Linear(dim, dim) + self.v = nn.Linear(dim, dim) + self.o = nn.Linear(dim, dim) + self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity() + self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity() + + def forward(self, x, seq_lens, grid_sizes, freqs): + r""" + Args: + x(Tensor): Shape [B, L, num_heads, C / num_heads] + seq_lens(Tensor): Shape [B] + grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W) + freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] + """ + b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim + + # query, key, value function + def qkv_fn(x): + q = self.norm_q(self.q(x)).view(b, s, n, d) + k = self.norm_k(self.k(x)).view(b, s, n, d) + v = self.v(x).view(b, s, n, d) + return q, k, v + + q, k, v = qkv_fn(x) + + # Save input dtype to ensure output matches + input_dtype = x.dtype + + x = flash_attention( + q=rope_apply(q, grid_sizes, freqs), + k=rope_apply(k, grid_sizes, freqs), + v=v, + k_lens=seq_lens, + window_size=self.window_size) + + # Ensure output dtype matches input dtype (in case rope_apply or flash_attention changed it) + x = x.to(dtype=input_dtype) + + # output + x = x.flatten(2) + x = self.o(x) + return x + + +class WanCrossAttention(WanSelfAttention): + + def forward(self, x, context, context_lens): + r""" + Args: + x(Tensor): Shape [B, L1, C] + context(Tensor): Shape [B, L2, C] + context_lens(Tensor): Shape [B] + """ + b, n, d = x.size(0), self.num_heads, self.head_dim + + # Save input dtype to ensure output matches + input_dtype = x.dtype + + # compute query, key, value + q = self.norm_q(self.q(x)).view(b, -1, n, d) + k = self.norm_k(self.k(context)).view(b, -1, n, d) + v = self.v(context).view(b, -1, n, d) + + # compute attention + x = flash_attention(q, k, v, k_lens=context_lens) + + # Ensure output dtype matches input dtype + x = x.to(dtype=input_dtype) + + # output + x = x.flatten(2) + x = self.o(x) + return x + + +class WanAttentionBlock(nn.Module): + + def __init__(self, + dim, + ffn_dim, + num_heads, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=False, + eps=1e-6): + super().__init__() + self.dim = dim + self.ffn_dim = ffn_dim + self.num_heads = num_heads + self.window_size = window_size + self.qk_norm = qk_norm + self.cross_attn_norm = cross_attn_norm + self.eps = eps + + # layers + self.norm1 = WanLayerNorm(dim, eps) + self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, + eps) + self.norm3 = WanLayerNorm( + dim, eps, + elementwise_affine=True) if cross_attn_norm else nn.Identity() + self.cross_attn = WanCrossAttention(dim, num_heads, (-1, -1), qk_norm, + eps) + self.norm2 = WanLayerNorm(dim, eps) + self.ffn = nn.Sequential( + nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'), + nn.Linear(ffn_dim, dim)) + + # modulation + self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5) + + def forward( + self, + 
x, + e, + seq_lens, + grid_sizes, + freqs, + context, + context_lens, + ): + r""" + Args: + x(Tensor): Shape [B, L, C] + e(Tensor): Shape [B, L1, 6, C] + seq_lens(Tensor): Shape [B], length of each sequence in batch + grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W) + freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] + """ + # Convert e to float32 for modulation computation (modulation expects float32) + e_float32 = e.to(dtype=torch.float32) if e.dtype != torch.float32 else e + with torch.amp.autocast('cuda', dtype=torch.float32): + e = (self.modulation.unsqueeze(0) + e_float32).chunk(6, dim=2) + assert e[0].dtype == torch.float32 + + # self-attention + # Ensure input dtype matches model weights (convert e to match x's dtype) + x_dtype = x.dtype + e_0 = e[0].squeeze(2).to(dtype=x_dtype) + e_1 = e[1].squeeze(2).to(dtype=x_dtype) + e_2 = e[2].squeeze(2).to(dtype=x_dtype) + attn_input = self.norm1(x) * (1 + e_1) + e_0 + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + # Ensure dtype consistency: y and e_2 should match x's dtype + x = x + (y * e_2).to(dtype=x_dtype) + + # cross-attention & ffn function + def cross_attn_ffn(x, context, context_lens, e): + x = x + self.cross_attn(self.norm3(x), context, context_lens) + # Ensure dtype consistency for FFN input + x_dtype = x.dtype + e_3 = e[3].squeeze(2).to(dtype=x_dtype) + e_4 = e[4].squeeze(2).to(dtype=x_dtype) + e_5 = e[5].squeeze(2).to(dtype=x_dtype) + ffn_input = self.norm2(x) * (1 + e_4) + e_3 + y = self.ffn(ffn_input) + # Ensure dtype consistency: y and e_5 should match x's dtype + x = x + (y * e_5).to(dtype=x_dtype) + return x + + x = cross_attn_ffn(x, context, context_lens, e) + return x + + +class Head(nn.Module): + + def __init__(self, dim, out_dim, patch_size, eps=1e-6): + super().__init__() + self.dim = dim + self.out_dim = out_dim + self.patch_size = patch_size + self.eps = eps + + # layers + out_dim = math.prod(patch_size) * out_dim + self.norm = WanLayerNorm(dim, eps) + self.head = nn.Linear(dim, out_dim) + + # modulation + self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5) + + def forward(self, x, e): + r""" + Args: + x(Tensor): Shape [B, L1, C] + e(Tensor): Shape [B, L1, C] + """ + # Convert e to float32 for modulation computation (modulation expects float32) + e_float32 = e.to(dtype=torch.float32) if e.dtype != torch.float32 else e + with torch.amp.autocast('cuda', dtype=torch.float32): + e = (self.modulation.unsqueeze(0) + e_float32.unsqueeze(2)).chunk(2, dim=2) + # Ensure dtype consistency: convert e to match x's dtype + x_dtype = x.dtype + e_0 = e[0].squeeze(2).to(dtype=x_dtype) + e_1 = e[1].squeeze(2).to(dtype=x_dtype) + head_input = self.norm(x) * (1 + e_1) + e_0 + x = self.head(head_input) + return x + + +class WanModel(ModelMixin, ConfigMixin): + r""" + Wan diffusion backbone supporting both text-to-video and image-to-video. + """ + + ignore_for_config = [ + 'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size' + ] + _no_split_modules = ['WanAttentionBlock'] + + @register_to_config + def __init__(self, + model_type='t2v', + patch_size=(1, 2, 2), + text_len=512, + in_dim=16, + dim=2048, + ffn_dim=8192, + freq_dim=256, + text_dim=4096, + out_dim=16, + num_heads=16, + num_layers=32, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6): + r""" + Initialize the diffusion model backbone. 
+ + Args: + model_type (`str`, *optional*, defaults to 't2v'): + Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video) + patch_size (`tuple`, *optional*, defaults to (1, 2, 2)): + 3D patch dimensions for video embedding (t_patch, h_patch, w_patch) + text_len (`int`, *optional*, defaults to 512): + Fixed length for text embeddings + in_dim (`int`, *optional*, defaults to 16): + Input video channels (C_in) + dim (`int`, *optional*, defaults to 2048): + Hidden dimension of the transformer + ffn_dim (`int`, *optional*, defaults to 8192): + Intermediate dimension in feed-forward network + freq_dim (`int`, *optional*, defaults to 256): + Dimension for sinusoidal time embeddings + text_dim (`int`, *optional*, defaults to 4096): + Input dimension for text embeddings + out_dim (`int`, *optional*, defaults to 16): + Output video channels (C_out) + num_heads (`int`, *optional*, defaults to 16): + Number of attention heads + num_layers (`int`, *optional*, defaults to 32): + Number of transformer blocks + window_size (`tuple`, *optional*, defaults to (-1, -1)): + Window size for local attention (-1 indicates global attention) + qk_norm (`bool`, *optional*, defaults to True): + Enable query/key normalization + cross_attn_norm (`bool`, *optional*, defaults to False): + Enable cross-attention normalization + eps (`float`, *optional*, defaults to 1e-6): + Epsilon value for normalization layers + """ + + super().__init__() + + assert model_type in ['t2v', 'i2v', 'ti2v', 's2v'] + self.model_type = model_type + + self.patch_size = patch_size + self.text_len = text_len + self.in_dim = in_dim + self.dim = dim + self.ffn_dim = ffn_dim + self.freq_dim = freq_dim + self.text_dim = text_dim + self.out_dim = out_dim + self.num_heads = num_heads + self.num_layers = num_layers + self.window_size = window_size + self.qk_norm = qk_norm + self.cross_attn_norm = cross_attn_norm + self.eps = eps + + # embeddings + self.patch_embedding = nn.Conv3d( + in_dim, dim, kernel_size=patch_size, stride=patch_size) + self.text_embedding = nn.Sequential( + nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'), + nn.Linear(dim, dim)) + + self.time_embedding = nn.Sequential( + nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim)) + self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6)) + + # blocks + self.blocks = nn.ModuleList([ + WanAttentionBlock(dim, ffn_dim, num_heads, window_size, qk_norm, + cross_attn_norm, eps) for _ in range(num_layers) + ]) + + # head + self.head = Head(dim, out_dim, patch_size, eps) + + # buffers (don't use register_buffer otherwise dtype will be changed in to()) + assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0 + d = dim // num_heads + self.freqs = torch.cat([ + rope_params(1024, d - 4 * (d // 6)), + rope_params(1024, 2 * (d // 6)), + rope_params(1024, 2 * (d // 6)) + ], + dim=1) + + # initialize weights + self.init_weights() + + def forward( + self, + x, + t, + context, + seq_len, + y=None, + ): + r""" + Forward pass through the diffusion model + + Args: + x (List[Tensor]): + List of input video tensors, each with shape [C_in, F, H, W] + t (Tensor): + Diffusion timesteps tensor of shape [B] + context (List[Tensor]): + List of text embeddings each with shape [L, C] + seq_len (`int`): + Maximum sequence length for positional encoding + y (List[Tensor], *optional*): + Conditional video inputs for image-to-video mode, same shape as x + + Returns: + List[Tensor]: + List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8] + """ + if 
self.model_type == 'i2v': + assert y is not None + # params + device = self.patch_embedding.weight.device + if self.freqs.device != device: + self.freqs = self.freqs.to(device) + + if y is not None: + x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)] + + # embeddings + # Ensure input dtype matches patch_embedding weight dtype + patch_weight_dtype = self.patch_embedding.weight.dtype + x = [self.patch_embedding(u.unsqueeze(0).to(dtype=patch_weight_dtype)) for u in x] + grid_sizes = torch.stack( + [torch.tensor(u.shape[2:], dtype=torch.long) for u in x]) + x = [u.flatten(2).transpose(1, 2) for u in x] + seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long) + assert seq_lens.max() <= seq_len + x = torch.cat([ + torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], + dim=1) for u in x + ]) + + # time embeddings + if t.dim() == 1: + t = t.expand(t.size(0), seq_len) + with torch.amp.autocast('cuda', dtype=torch.float32): + bt = t.size(0) + t = t.flatten() + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, + t).unflatten(0, (bt, seq_len)).float()) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + assert e.dtype == torch.float32 and e0.dtype == torch.float32 + + # Keep e and e0 as float32 for modulation computation + # They will be converted to x.dtype inside WanAttentionBlock.forward and Head.forward when needed + + # context + context_lens = None + # Ensure context input dtype matches text_embedding weight dtype + text_weight_dtype = self.text_embedding[0].weight.dtype + context = self.text_embedding( + torch.stack([ + torch.cat( + [u, u.new_zeros(self.text_len - u.size(0), u.size(1))]) + for u in context + ]).to(dtype=text_weight_dtype)) + + # arguments + kwargs = dict( + e=e0, + seq_lens=seq_lens, + grid_sizes=grid_sizes, + freqs=self.freqs, + context=context, + context_lens=context_lens) + + for block in self.blocks: + x = block(x, **kwargs) + + # head + x = self.head(x, e) + + # unpatchify + x = self.unpatchify(x, grid_sizes) + return [u.float() for u in x] + + def unpatchify(self, x, grid_sizes): + r""" + Reconstruct video tensors from patch embeddings. + + Args: + x (List[Tensor]): + List of patchified features, each with shape [L, C_out * prod(patch_size)] + grid_sizes (Tensor): + Original spatial-temporal grid dimensions before patching, + shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches) + + Returns: + List[Tensor]: + Reconstructed video tensors with shape [C_out, F, H / 8, W / 8] + """ + + c = self.out_dim + out = [] + for u, v in zip(x, grid_sizes.tolist()): + u = u[:math.prod(v)].view(*v, *self.patch_size, c) + u = torch.einsum('fhwpqrc->cfphqwr', u) + u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)]) + out.append(u) + return out + + def init_weights(self): + r""" + Initialize model parameters using Xavier initialization. 
+ """ + + # basic init + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + + # init embeddings + nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1)) + for m in self.text_embedding.modules(): + if isinstance(m, nn.Linear): + nn.init.normal_(m.weight, std=.02) + for m in self.time_embedding.modules(): + if isinstance(m, nn.Linear): + nn.init.normal_(m.weight, std=.02) + + # init output layer + nn.init.zeros_(self.head.head.weight) + + + + +class WanDiscreteVideoTransformer(ModelMixin, ConfigMixin): + r""" + Wrapper around :class:`WanModel` that makes it usable as a **discrete video diffusion backbone**. + + The goals of this wrapper are: + + - keep the inner :class:`WanModel` architecture and parameter names intact so that Wan-1.3B + weights can later be loaded directly into ``self.backbone``; + - expose a simpler interface that takes **discrete codebook indices** (from a 2D VQ-VAE on + pseudo-video) and returns **logits over the codebook** for each spatio‑temporal position. + + Notes + ----- + - This class does **not** try to be drop‑in compatible with Meissonic's 2D ``Transformer2DModel``. + It is a parallel, video‑oriented path that still follows the same *discrete diffusion* principle: + predict per‑token logits given masked tokens + text. + - Pseudo‑video is represented as a 4D integer tensor ``[B, F, H, W]`` of codebook indices. + How to get these tokens from the current 2D VQ-VAE (e.g. per‑frame encoding & stacking) + is left to the higher‑level training / pipeline code. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + # discrete codebook settings + codebook_size: int, + vocab_size: int, + # video layout + num_frames: int, + height: int, + width: int, + # Wan backbone hyper‑parameters (mirrors WanModel.__init__) + model_type: str = 't2v', + patch_size: tuple = (1, 2, 2), + text_len: int = 512, + in_dim: int = 16, + dim: int = 2048, + ffn_dim: int = 8192, + freq_dim: int = 256, + text_dim: int = 4096, + out_dim: int = 16, + num_heads: int = 16, + num_layers: int = 32, + window_size: tuple = (-1, -1), + qk_norm: bool = True, + cross_attn_norm: bool = True, + eps: float = 1e-6, + ): + super().__init__() + + # save a minimal set of attributes useful for downstream tooling + self.codebook_size = codebook_size + self.vocab_size = vocab_size + self.num_frames = num_frames + self.height = height + self.width = width + + # 1) backbone: keep WanModel intact for future weight loading + self.backbone = WanModel( + model_type=model_type, + patch_size=patch_size, + text_len=text_len, + in_dim=in_dim, + dim=dim, + ffn_dim=ffn_dim, + freq_dim=freq_dim, + text_dim=text_dim, + out_dim=out_dim, + num_heads=num_heads, + num_layers=num_layers, + window_size=window_size, + qk_norm=qk_norm, + cross_attn_norm=cross_attn_norm, + eps=eps, + ) + + # 2) discrete token embedding -> continuous video volume + # + # Input: tokens [B, F, H, W] with values in [0, vocab_size) where: + # - [0, codebook_size-1] = actual Cosmos codes (direct mapping, no shift) + # - codebook_size = mask_token_id (reserved for masking) + # Output: list of length B with tensors [in_dim, F, H, W] + # + # We keep this outside the backbone so that loading official Wan 1.3B weights + # into self.backbone will still work without clashes. 
+ # Note: vocab_size = codebook_size + 1 to accommodate mask_token_id = codebook_size + self.token_embedding = nn.Embedding(vocab_size, in_dim) + + # 3) projection from continuous video output -> logits over codebook + # + # Backbone output: list of B tensors [out_dim, F, H', W'] + # We map it with a 3D 1x1x1 conv to [vocab_size, F, H', W']. + # Note: vocab_size = codebook_size + 1, where index codebook_size is reserved for mask_token_id + self.logits_head = nn.Conv3d(out_dim, vocab_size, kernel_size=1) + + # Gradient checkpointing support + self.gradient_checkpointing = False + + def _tokens_to_video(self, tokens: torch.LongTensor) -> list: + r""" + Convert discrete tokens ``[B, F, H, W]`` into a list of length ``B`` where each element + is a dense video tensor ``[in_dim, F, H, W]`` suitable for :class:`WanModel`. + + Note: + This method supports dynamic input dimensions. The num_frames, height, width + stored in config are used as defaults/for seq_len calculation, but inputs can + have different dimensions as long as they're valid. + """ + assert tokens.dim() == 4, f"expected [B, F, H, W] tokens, got {tokens.shape}" + # Dynamic dimensions - no strict dimension checks, WanModel handles variable sizes + + # [B, F, H, W, in_dim] + # Ensure output dtype matches token_embedding weight dtype + x = self.token_embedding(tokens) + # Ensure dtype matches model's expected dtype (usually bfloat16 for mixed precision) + token_embedding_dtype = self.token_embedding.weight.dtype + x = x.to(dtype=token_embedding_dtype) + # [B, in_dim, F, H, W] + x = x.permute(0, 4, 1, 2, 3).contiguous() + + # WanModel expects a list of [C_in, F, H, W] + return [x_i for x_i in x] + + def _text_to_list(self, encoder_hidden_states: torch.Tensor) -> list: + r""" + Convert batched text embeddings ``[B, L, C]`` into the list-of-tensors format + expected by :class:`WanModel`. + """ + assert encoder_hidden_states.dim() == 3, ( + f"expected encoder_hidden_states [B, L, C], got {encoder_hidden_states.shape}") + return [e for e in encoder_hidden_states] + + def _set_gradient_checkpointing(self, enable=True, gradient_checkpointing_func=None): + """Set gradient checkpointing for the module.""" + self.gradient_checkpointing = enable + + def forward( + self, + tokens: torch.LongTensor, + timesteps: torch.LongTensor, + encoder_hidden_states: torch.FloatTensor, + y: Optional[list] = None, + ) -> torch.FloatTensor: + r""" + Forward pass of the **discrete video transformer**. + + Args: + tokens (`torch.LongTensor` of shape `[B, F, H, W]`): + Discrete codebook indices (e.g. from a 2D VQ-VAE applied frame‑wise). + timesteps (`torch.LongTensor` of shape `[B]` or `[B, seq_len]`): + Diffusion timestep(s), following the same semantics as Meissonic's scalar timesteps; + ``seq_len`` is the patchified token count, i.e. ``F * (H // 2) * (W // 2)`` for the default patch size. + encoder_hidden_states (`torch.FloatTensor` of shape `[B, L, C_text]`): + Text embeddings (e.g. from CLIP). Each sample corresponds to one video. + y (`Optional[list]`): + Optional conditional video list passed to the underlying :class:`WanModel` + for i2v / ti2v / s2v variants. For now this is surfaced as a raw passthrough + and can be left as ``None`` for pure text‑to‑video. + + Returns: + `torch.FloatTensor`: + Logits over the extended codebook of shape `[B, vocab_size, F, H, W]`, where + ``vocab_size = codebook_size + 1`` (the extra channel is the mask token). The backbone + unpatchifies its output, so the temporal and spatial sizes of the logits match the + input token grid ``[F, H, W]``.
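For concreteness, a usage sketch of this forward interface (illustrative only; the tiny hyper-parameters below are hypothetical and simply mirror the repository's smoke test, and the shapes follow the Returns note above):

```python
import torch

# Hypothetical tiny configuration, purely for illustration.
model = WanDiscreteVideoTransformer(
    codebook_size=128, vocab_size=129,   # 129 = 128 codes + 1 mask token
    num_frames=2, height=16, width=16,
    in_dim=32, dim=64, ffn_dim=128, freq_dim=32,
    text_dim=64, out_dim=32, num_heads=4, num_layers=2,
)
tokens = torch.randint(0, 128, (2, 2, 16, 16))   # [B, F, H, W] codebook indices
timesteps = torch.randint(0, 1000, (2,))          # [B]
text = torch.randn(2, 8, 64)                      # [B, L, text_dim]
logits = model(tokens=tokens, timesteps=timesteps, encoder_hidden_states=text)
# logits: [2, 129, 2, 16, 16], per-position logits over the 128 codes plus the mask token
```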
+ """ + device = tokens.device + if DEBUG_TRANSFORMER: + print(f"[DEBUG-transformer] Input: tokens.shape={tokens.shape}, encoder_hidden_states.shape={encoder_hidden_states.shape}, timesteps.shape={timesteps.shape}") + x_list = self._tokens_to_video(tokens) + context_list = self._text_to_list(encoder_hidden_states) + if DEBUG_TRANSFORMER: + print(f"[DEBUG-transformer] After conversion: len(x_list)={len(x_list)}, len(context_list)={len(context_list)}") + if len(x_list) > 0: + print(f"[DEBUG-transformer] x_list[0].shape={x_list[0].shape}") + if len(context_list) > 0: + print(f"[DEBUG-transformer] context_list[0].shape={context_list[0].shape}") + + # Calculate seq_len from actual input dimensions (supports dynamic sizes) + # tokens: [B, F, H, W] -> after patchification: seq_len = F * (H/p_h) * (W/p_w) + _, f_in, h_in, w_in = tokens.shape + h_patch = h_in // self.backbone.patch_size[1] + w_patch = w_in // self.backbone.patch_size[2] + seq_len = f_in * h_patch * w_patch + + # Prepare timesteps in the exact shape WanModel.forward expects. + # Its current implementation assumes `t` is either [B, seq_len] or will be + # expanded from 1D; the 1D branch is slightly buggy for non-singleton dims, + # so we always give it a [B, seq_len] tensor here. + if timesteps.dim() == 1: + # [B] -> [B, 1] -> [B, seq_len] (broadcast along sequence) + t_model = timesteps.to(device).unsqueeze(1).expand(-1, seq_len) + elif timesteps.dim() == 2: + assert timesteps.size(1) == seq_len, ( + f"Expected timesteps second dim == seq_len ({seq_len}), " + f"but got {timesteps.size(1)}" + ) + t_model = timesteps.to(device) + else: + raise ValueError( + f"Unsupported timesteps shape {timesteps.shape}; " + "expected [B] or [B, seq_len]" + ) + if DEBUG_TRANSFORMER: + print(f"[DEBUG-transformer] t_model.shape={t_model.shape}") + + # WanModel.forward expects: + # x: List[Tensor [C_in, F, H, W]] + # t: Tensor [B] or [B, seq_len] + # context: List[Tensor [L, C_text]] + # seq_len: int + # y: Optional[List[Tensor]] + if self.training and self.gradient_checkpointing: + def create_custom_forward(module): + def custom_forward(*inputs): + # Unpack inputs: x_list, t, context_list, seq_len, y + x_in, t_in, context_in, seq_len_in, y_in = inputs + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + return custom_forward + + # Use gradient checkpointing for the backbone + ckpt_kwargs = {"use_reentrant": False} + out_list = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.backbone), + x_list, + t_model, + context_list, + seq_len, + y, + **ckpt_kwargs, + ) + else: + out_list = self.backbone( + x=x_list, + t=t_model, + context=context_list, + seq_len=seq_len, + y=y, + ) + if DEBUG_TRANSFORMER: + print(f"[DEBUG-transformer] After backbone: len(out_list)={len(out_list)}") + if len(out_list) > 0: + print(f"[DEBUG-transformer] out_list[0].shape={out_list[0].shape}") + + # out_list: length B, each [C_out, F, H_out, W_out] + vids = torch.stack(out_list, dim=0) # [B, C_out, F, H_out, W_out] + if DEBUG_TRANSFORMER: + print(f"[DEBUG-transformer] After stack: vids.shape={vids.shape}") + # Ensure vids dtype matches logits_head weight dtype + vids = vids.to(dtype=self.logits_head.weight.dtype) + logits = self.logits_head(vids) # [B, vocab_size, F, H_out, W_out] where vocab_size = codebook_size + 1 + if DEBUG_TRANSFORMER: + print(f"[DEBUG-transformer] Final logits.shape={logits.shape}") + return logits + +# def _available_device(): +# return "cuda" if torch.cuda.is_available() else "cpu" + + +# def 
test_wan_discrete_video_transformer_forward_and_shapes(): +# """ +# Basic smoke test: +# - build a tiny WanDiscreteVideoTransformer +# - run a forward pass with random pseudo-video tokens + random text +# - check output shapes, parameter count and (if CUDA present) memory usage +# """ + +# device = _available_device() + +# # small config to keep the test lightweight +# codebook_size = 128 +# vocab_size = codebook_size + 1 # reserve one for mask if needed later +# num_frames = 2 +# height = 16 +# width = 16 + +# model = WanDiscreteVideoTransformer( +# codebook_size=codebook_size, +# vocab_size=vocab_size, +# num_frames=num_frames, +# height=height, +# width=width, +# # shrink Wan backbone for the unit test +# in_dim=32, +# dim=64, +# ffn_dim=128, +# freq_dim=32, +# text_dim=64, +# out_dim=32, +# num_heads=4, +# num_layers=2, +# ).to(device) +# model.eval() + +# batch_size = 2 + +# # pseudo-video tokens from 2D VQ-VAE on frames: [B, F, H, W] +# tokens = torch.randint( +# low=0, +# high=codebook_size, +# size=(batch_size, num_frames, height, width), +# dtype=torch.long, +# device=device, +# ) + +# # text: [B, L, C_text] +# text_seq_len = 8 +# encoder_hidden_states = torch.randn( +# batch_size, text_seq_len, model.backbone.text_dim, device=device +# ) + +# # timesteps: [B] +# timesteps = torch.randint( +# low=0, high=1000, size=(batch_size,), dtype=torch.long, device=device +# ) + +# # track memory if CUDA is available +# if device == "cuda": +# torch.cuda.reset_peak_memory_stats() +# mem_before = torch.cuda.memory_allocated() +# else: +# mem_before = 0 + +# with torch.no_grad(): +# logits = model( +# tokens=tokens, +# timesteps=timesteps, +# encoder_hidden_states=encoder_hidden_states, +# y=None, +# ) + +# if device == "cuda": +# mem_after = torch.cuda.memory_allocated() +# peak_mem = torch.cuda.max_memory_allocated() +# else: +# mem_after = mem_before +# peak_mem = mem_before + +# # logits: [B, codebook_size, F, H_out, W_out] +# assert logits.shape[0] == batch_size +# assert logits.shape[1] == codebook_size +# assert logits.shape[2] == num_frames + +# # WanModel returns unpatchified videos, so spatial size matches the input grid. 
+# h_out = height +# w_out = width +# assert logits.shape[3] == h_out +# assert logits.shape[4] == w_out + +# # parameter count sanity check (just ensure it's > 0 and finite) +# num_params = sum(p.numel() for p in model.parameters()) +# assert num_params > 0 +# assert math.isfinite(float(num_params)) + +# # memory sanity check (on CUDA the forward pass should allocate > 0 bytes) +# if device == "cuda": +# assert peak_mem >= mem_after >= mem_before + + + +# import torch +# from safetensors import safe_open +# # from src.transformer_video import WanDiscreteVideoTransformer + +# ckpt_path = "/mnt/Meissonic/model/diffusion_pytorch_model.safetensors" + +# # 1) Instantiate with the hyper-parameters you want to match for wan2.1 (a common config is given here; it must match the ckpt) +# model = WanDiscreteVideoTransformer( +# codebook_size=128, # custom on the discrete side +# vocab_size=129, +# num_frames=2, +# height=16, +# width=16, +# # Wan backbone hyper-parameters must match the ckpt exactly +# model_type="t2v", +# patch_size=(1, 2, 2), +# in_dim=16, +# dim=1536, +# ffn_dim=8960, +# freq_dim=256, +# text_dim=4096, +# out_dim=16, +# num_heads=12, +# num_layers=30, +# window_size=(-1, -1), +# qk_norm=True, +# cross_attn_norm=True, +# eps=1e-6, +# ) + +# # 2) Load the safetensors checkpoint +# state_dict = {} +# with safe_open(ckpt_path, framework="pt", device="cpu") as f: +# for k in f.keys(): +# state_dict[k] = f.get_tensor(k) + +# # 3) Try loading into the backbone (without touching token_embedding/logits_head) +# missing, unexpected = model.backbone.load_state_dict(state_dict, strict=False) + +# print("Missing keys:", missing[:50], "... total", len(missing)) +# print("Unexpected keys:", unexpected[:50], "... total", len(unexpected)) +# print("Backbone params (M):", sum(p.numel() for p in model.backbone.parameters()) / 1e6) +# print("Params (M):", sum(p.numel() for p in model.parameters()) / 1e6) + +# # if __name__ == '__main__': +# # # test_wan_discrete_video_transformer_forward_and_shapes() +# # print('WanDiscreteVideoTransformer forward pass test: PASSED') + + + + \ No newline at end of file diff --git a/Meissonic/tests/test.py b/Meissonic/tests/test.py new file mode 100644 index 0000000000000000000000000000000000000000..a93eee1f28259d6ad78c0ee8ba9157dfabdb5d54 --- /dev/null +++ b/Meissonic/tests/test.py @@ -0,0 +1,111 @@ +import math + +import torch + +from src.transformer_video import WanDiscreteVideoTransformer + + +def _available_device(): + return "cuda" if torch.cuda.is_available() else "cpu" + + +def test_wan_discrete_video_transformer_forward_and_shapes(): + """ + Basic smoke test: + - build a tiny WanDiscreteVideoTransformer + - run a forward pass with random pseudo-video tokens + random text + - check output shapes, parameter count and (if CUDA present) memory usage + """ + + device = _available_device() + + # small config to keep the test lightweight + codebook_size = 128 + vocab_size = codebook_size + 1 # reserve one for mask if needed later + num_frames = 2 + height = 16 + width = 16 + + model = WanDiscreteVideoTransformer( + codebook_size=codebook_size, + vocab_size=vocab_size, + num_frames=num_frames, + height=height, + width=width, + # shrink Wan backbone for the unit test + in_dim=32, + dim=64, + ffn_dim=128, + freq_dim=32, + text_dim=64, + out_dim=32, + num_heads=4, + num_layers=2, + ).to(device) + model.eval() + + batch_size = 2 + + # pseudo-video tokens from 2D VQ-VAE on frames: [B, F, H, W] + tokens = torch.randint( + low=0, + high=codebook_size, + size=(batch_size, num_frames, height, width), + dtype=torch.long, + device=device, + ) + + # text: [B, L, C_text] + text_seq_len = 8 + encoder_hidden_states = torch.randn( + batch_size,
text_seq_len, model.backbone.text_dim, device=device + ) + + # timesteps: [B] + timesteps = torch.randint( + low=0, high=1000, size=(batch_size,), dtype=torch.long, device=device + ) + + # track memory if CUDA is available + if device == "cuda": + torch.cuda.reset_peak_memory_stats() + mem_before = torch.cuda.memory_allocated() + else: + mem_before = 0 + + with torch.no_grad(): + logits = model( + tokens=tokens, + timesteps=timesteps, + encoder_hidden_states=encoder_hidden_states, + y=None, + ) + + if device == "cuda": + mem_after = torch.cuda.memory_allocated() + peak_mem = torch.cuda.max_memory_allocated() + else: + mem_after = mem_before + peak_mem = mem_before + + # logits: [B, vocab_size, F, H_out, W_out], where vocab_size = codebook_size + 1 (mask token included) + assert logits.shape[0] == batch_size + assert logits.shape[1] == vocab_size + assert logits.shape[2] == num_frames + + # WanModel unpatchifies its output, so the spatial size matches the input token grid + h_out = height + w_out = width + assert logits.shape[3] == h_out + assert logits.shape[4] == w_out + + # parameter count sanity check (just ensure it's > 0 and finite) + num_params = sum(p.numel() for p in model.parameters()) + assert num_params > 0 + assert math.isfinite(float(num_params)) + + # memory sanity check (on CUDA the forward pass should allocate > 0 bytes) + if device == "cuda": + assert peak_mem >= mem_after >= mem_before + + diff --git a/Meissonic/train/FEATURE_EXTRACTION_README.md b/Meissonic/train/FEATURE_EXTRACTION_README.md new file mode 100644 index 0000000000000000000000000000000000000000..064843a220cf2442413f8d30ed72363e735a9f07 --- /dev/null +++ b/Meissonic/train/FEATURE_EXTRACTION_README.md @@ -0,0 +1,142 @@ +# Feature Extraction and Precomputed-Feature Training Guide + +This document explains how to use pre-extracted features to speed up video training. + +## Overview + +To make training more efficient, we can pre-extract: +1. **Video features (codes)**: encode videos into discrete tokens with CosmosVideoTokenizer +2. **Text features (embeddings)**: encode captions into embeddings with T5/UMT5 + +During training these pre-extracted features are loaded directly, so videos and text do not have to be re-encoded on every run. + +## Step 1: Extract features + +Extract features with the `extract_features.py` script: + +```bash +python train/extract_features.py \ + --csv_path /path/to/OpenVid1M_reorganized.csv \ + --video_root_dir /path/to/video_reorg \ + --output_dir /path/to/extracted_features \ + --text_encoder_architecture umt5-base \ + --video_tokenizer_model_id Cosmos-1.0-Tokenizer-DV8x16x16 \ + --num_frames 16 \ + --video_height 480 \ + --video_width 848 \ + --batch_size 4 \ + --num_workers 4 +``` + +### Argument reference + +- `--csv_path`: path to the OpenVid1M CSV file +- `--video_root_dir`: root directory of the video files (optional, auto-detected if omitted) +- `--output_dir`: directory where extracted features are saved +- `--text_encoder_architecture`: text encoder architecture (umt5-base/umt5-xxl/t5) +- `--video_tokenizer_model_id`: Cosmos video tokenizer model ID +- `--num_frames`, `--video_height`, `--video_width`: video parameters +- `--batch_size`: batch size (adjust to GPU memory) +- `--num_workers`: number of dataloader worker processes +- `--max_samples`: maximum number of samples to process (for testing; all samples by default) +- `--resume_from_index`: resume extraction from the given index (after an interruption) + +### Output layout + +Extracted features are saved under `output_dir` using a three-level directory layout so that no single folder holds too many files: + +``` +extracted_features/ +├── video_codes/ # video codes (three-level layout) +│ ├── 000/ # level 1: index // 1000000 +│ │ ├── 000/ # level 2: (index // 1000) % 1000 +│ │ │ ├── 000/ # level 3: index % 1000 +│ │ │ │ ├── 00000000.npy +│ │ │ │ └── ... +│ │ │ └── ... +│ │ └── ... +│ └── ... +├── text_embeddings/ # text embeddings (three-level layout) +│ ├── 000/ +│ │ ├── 000/ +│ │ │ ├── 000/ +│ │ │ │ ├── 00000000.npy +│ │ │ │ └── ... +│ │ │ └── ... +│ │ └── ... +│ └── ...
+└── metadata.json # metadata (per-sample information) +``` + +**Directory layout**: +- For index `index`, the file path is `level1/level2/level3/index.npy` + - `level1 = index // 1000000` (0-999) + - `level2 = (index // 1000) % 1000` (0-999) + - `level3 = index % 1000` (0-999) +- For example, index `1234567` is stored at `001/234/567/1234567.npy` + +This layout supports up to 1,000,000,000 samples with at most 1000 folders per level, avoiding the problem of too many files in a single folder. + +## Step 2: Train with pre-extracted features + +Pass the `--use_precomputed_features` and `--features_dir` arguments to the training script: + +```bash +python train/train_mei_video.py \ + --use_precomputed_features \ + --features_dir /path/to/extracted_features \ + --text_encoder_architecture umt5-base \ + --video_tokenizer_model_id Cosmos-1.0-Tokenizer-DV8x16x16 \ + --num_frames 16 \ + --video_height 480 \ + --video_width 848 \ + --train_batch_size 8 \ + --learning_rate 3e-4 \ + --max_train_steps 10000 \ + --output_dir ./output \ + --mixed_precision bf16 \ + --gradient_checkpointing \ + --wan_pretrained_path /path/to/wan/weights \ + --wan_backbone_lr_ratio 0.1 \ + --freeze_wan_backbone # optional: freeze the backbone +``` + +### Key arguments + +- `--use_precomputed_features`: enable the pre-extracted feature mode +- `--features_dir`: directory containing the pre-extracted features +- all other training arguments stay the same + +## Benefits + +1. **Faster training**: videos and text are not re-encoded on every run +2. **Better GPU utilization**: less CPU-GPU data transfer +3. **Memory efficiency**: feature files are much smaller than the raw videos +4. **Reproducibility**: using identical features keeps training runs consistent + +## Notes + +1. **Feature consistency**: make sure the parameters used for extraction (num_frames, height, width, text_encoder) match those used for training +2. **Storage**: features for 1M samples need roughly tens of GB of disk space +3. **Resuming extraction**: if extraction is interrupted, resume with the `--resume_from_index` argument +4. **Validation**: the text_encoder is still needed for validation during training, but it is not used to encode the training data + +## Troubleshooting + +### Feature extraction fails + +- Check that the video file paths are correct +- Check that GPU memory is sufficient (reduce batch_size if needed) +- Inspect the error messages in the logs + +### Features not found during training + +- Confirm that the `--features_dir` path is correct +- Confirm that the feature files exist (check the `video_codes/` and `text_embeddings/` directories) +- Check that the `metadata.json` file exists + +### Dimension mismatch + +- Make sure extraction and training use the same video parameters (num_frames, height, width) +- Make sure the same text_encoder architecture is used + diff --git a/Meissonic/train/__pycache__/dataset_utils.cpython-310.pyc b/Meissonic/train/__pycache__/dataset_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a874bf50e8caec00c4a4bcf46b7268ea53440204 Binary files /dev/null and b/Meissonic/train/__pycache__/dataset_utils.cpython-310.pyc differ diff --git a/Meissonic/train/__pycache__/dataset_utils.cpython-313.pyc b/Meissonic/train/__pycache__/dataset_utils.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87f85d8ba8fe7583d7c8e1eef846e5150c99939e Binary files /dev/null and b/Meissonic/train/__pycache__/dataset_utils.cpython-313.pyc differ diff --git a/Meissonic/train/__pycache__/trainer_utils.cpython-310.pyc b/Meissonic/train/__pycache__/trainer_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbe917b20ca139d7ec602b67b9089e1d3e83eeb7 Binary files /dev/null and b/Meissonic/train/__pycache__/trainer_utils.cpython-310.pyc differ diff --git a/Meissonic/train/__pycache__/trainer_utils.cpython-313.pyc b/Meissonic/train/__pycache__/trainer_utils.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7842410521abb09c0c50ac7720d0b275d2d48c0c Binary files /dev/null and b/Meissonic/train/__pycache__/trainer_utils.cpython-313.pyc differ diff --git a/Meissonic/train/check_codebook_range.py b/Meissonic/train/check_codebook_range.py new file mode 100644 index 0000000000000000000000000000000000000000..3f015b106b650c0af6fe21e1e3e2819e5e954c19 --- /dev/null +++ b/Meissonic/train/check_codebook_range.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python3 +""" +Check codebook range by iterating through videos and extracting codes.
+ +This script loads videos from the dataset, encodes them to get video codes, +and tracks the min/max values to determine the codebook range. +""" + +import argparse +import os +import sys +import logging +from tqdm import tqdm +import torch +import numpy as np + +sys.path.append(os.getcwd()) + +from train.dataset_utils import OpenVid1MDataset, PrecomputedFeatureDataset +from src.pipeline_video import CosmosVideoTokenizer +from transformers import T5Tokenizer +from torch.utils.data import DataLoader + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Check codebook range from video dataset") + + parser.add_argument( + "--csv_path", + type=str, + default=None, + help="Path to OpenVid1M CSV file (if using raw videos)", + ) + parser.add_argument( + "--video_root_dir", + type=str, + default=None, + help="Root directory containing video files", + ) + parser.add_argument( + "--features_dir", + type=str, + default=None, + help="Directory containing pre-extracted features (if using precomputed features)", + ) + parser.add_argument( + "--video_tokenizer_model_id", + type=str, + default="Cosmos-1.0-Tokenizer-DV8x16x16", + help="HuggingFace model ID for Cosmos video tokenizer", + ) + parser.add_argument( + "--num_frames", + type=int, + default=16, + help="Number of frames per video", + ) + parser.add_argument( + "--video_height", + type=int, + default=480, + help="Video height", + ) + parser.add_argument( + "--video_width", + type=int, + default=848, + help="Video width", + ) + parser.add_argument( + "--text_encoder_architecture", + type=str, + default="umt5-base", + choices=["umt5-base", "umt5-xxl", "t5"], + help="Text encoder architecture", + ) + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="Batch size (use 1 for detailed per-sample tracking)", + ) + parser.add_argument( + "--max_samples", + type=int, + default=None, + help="Maximum number of samples to check. 
If None, check all.", + ) + parser.add_argument( + "--check_interval", + type=int, + default=10, + help="Print statistics every N samples", + ) + + return parser.parse_args() + + +def main(): + args = parse_args() + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + dtype = torch.float32 + + logger.info(f"Using device: {device}") + + # Initialize video tokenizer (only needed if not using precomputed features) + video_tokenizer = None + use_precomputed = args.features_dir is not None + + if not use_precomputed: + if args.csv_path is None: + raise ValueError("Either --csv_path or --features_dir must be provided") + + logger.info(f"Loading video tokenizer: {args.video_tokenizer_model_id}") + video_tokenizer = CosmosVideoTokenizer( + model_id=args.video_tokenizer_model_id, + device=device, + dtype=dtype + ) + video_tokenizer.requires_grad_(False) + video_tokenizer.eval() + + # Get tokenizer info + logger.info(f"Video tokenizer codebook_size: {video_tokenizer.codebook_size}") + logger.info(f"Video tokenizer mask_token_id: {video_tokenizer.mask_token_id}") + + # Create dataset + if use_precomputed: + logger.info(f"Using precomputed features from: {args.features_dir}") + dataset = PrecomputedFeatureDataset( + features_dir=args.features_dir, + num_samples=args.max_samples, + ) + else: + # Auto-detect video_root_dir if not provided + if args.video_root_dir is None: + csv_dir = os.path.dirname(args.csv_path) + if os.path.exists(os.path.join(csv_dir, 'video_reorg')): + video_root_dir = os.path.join(csv_dir, 'video_reorg') + elif os.path.exists(os.path.join(os.path.dirname(csv_dir), 'video_reorg')): + video_root_dir = os.path.join(os.path.dirname(csv_dir), 'video_reorg') + else: + video_root_dir = csv_dir + logger.warning(f"Video directory not found, using CSV directory: {video_root_dir}") + else: + video_root_dir = args.video_root_dir + + # Create tokenizer for dataset + if args.text_encoder_architecture == "umt5-base": + model_id = "google/umt5-base" + elif args.text_encoder_architecture == "umt5-xxl": + model_id = "google/umt5-xxl" + elif args.text_encoder_architecture == "t5": + model_id = "t5-base" + else: + raise ValueError(f"Unknown text encoder: {args.text_encoder_architecture}") + + tokenizer = T5Tokenizer.from_pretrained(model_id) + + dataset = OpenVid1MDataset( + csv_path=args.csv_path, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + text_encoder_architecture=args.text_encoder_architecture, + use_random_temporal_crop=False, # Fixed sampling for consistency + use_random_crop=False, # Center crop for consistency + ) + + if args.max_samples is not None: + dataset.data = dataset.data[:args.max_samples] + logger.info(f"Limited dataset to {len(dataset)} samples") + + logger.info(f"Dataset size: {len(dataset)}") + + # Create dataloader + dataloader = DataLoader( + dataset, + batch_size=args.batch_size, + shuffle=False, + num_workers=0, # Use 0 to avoid multiprocessing issues + pin_memory=False, + ) + + # Initialize statistics + global_min = None + global_max = None + total_samples = 0 + failed_samples = 0 + + logger.info("Starting to check codebook range...") + logger.info("=" * 80) + + with torch.no_grad(): + for batch_idx, batch in enumerate(tqdm(dataloader, desc="Checking codes")): + try: + if use_precomputed: + # Use pre-extracted video codes + video_codes = batch["video_codes"] # [B, F', H', W'] + if isinstance(video_codes, torch.Tensor): + video_codes = video_codes.long() + 
else: + video_codes = torch.from_numpy(video_codes).long() + else: + # Encode videos to get codes + videos = batch["video"].to(device, non_blocking=True) # [B, C, F, H, W] + video_codes = video_tokenizer.encode(videos) # [B, F', H', W'] + video_codes = video_codes.cpu().long() + + # Update statistics + batch_min = video_codes.min().item() + batch_max = video_codes.max().item() + + if global_min is None: + global_min = batch_min + global_max = batch_max + else: + global_min = min(global_min, batch_min) + global_max = max(global_max, batch_max) + + total_samples += video_codes.shape[0] + + # Print statistics periodically + if (batch_idx + 1) % args.check_interval == 0 or batch_idx == 0: + print(f"\n[Sample {total_samples}]") + print(f" Current batch range: [{batch_min}, {batch_max}]") + print(f" Global range so far: [{global_min}, {global_max}]") + print(f" Codebook size (expected): {video_tokenizer.codebook_size if video_tokenizer else 'N/A'}") + if video_tokenizer: + expected_max = video_tokenizer.codebook_size - 1 + print(f" Expected max (codebook_size - 1): {expected_max}") + if global_max > expected_max: + print(f" ⚠️ WARNING: Found code {global_max} > expected max {expected_max}!") + if global_min < 0: + print(f" ⚠️ WARNING: Found code {global_min} < 0!") + + # Print unique values count for current batch + unique_values = torch.unique(video_codes).tolist() + print(f" Unique values in batch: {len(unique_values)}") + if len(unique_values) <= 20: + print(f" Values: {sorted(unique_values)}") + else: + print(f" Min unique: {min(unique_values)}, Max unique: {max(unique_values)}") + print("-" * 80) + + except Exception as e: + failed_samples += args.batch_size + logger.error(f"Failed to process batch {batch_idx}: {e}") + continue + + # Final summary + logger.info("=" * 80) + logger.info("FINAL STATISTICS:") + logger.info(f" Total samples processed: {total_samples}") + logger.info(f" Failed samples: {failed_samples}") + logger.info(f" Global min code: {global_min}") + logger.info(f" Global max code: {global_max}") + logger.info(f" Code range: [{global_min}, {global_max}]") + + if video_tokenizer: + expected_max = video_tokenizer.codebook_size - 1 + logger.info(f" Expected max (codebook_size - 1): {expected_max}") + logger.info(f" Codebook size: {video_tokenizer.codebook_size}") + logger.info(f" Mask token ID: {video_tokenizer.mask_token_id}") + + if global_max > expected_max: + logger.warning(f" ⚠️ WARNING: Found code {global_max} > expected max {expected_max}!") + elif global_max == expected_max: + logger.info(f" ✓ Max code matches expected max") + else: + logger.info(f" Note: Max code {global_max} < expected max {expected_max} (some codes may not be used)") + + if global_min < 0: + logger.warning(f" ⚠️ WARNING: Found code {global_min} < 0!") + elif global_min == 0: + logger.info(f" ✓ Min code is 0 (as expected)") + else: + logger.info(f" Note: Min code {global_min} > 0 (some codes may not be used)") + + logger.info("=" * 80) + + +if __name__ == "__main__": + main() + diff --git a/Meissonic/train/dataset_utils.py b/Meissonic/train/dataset_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1ef0e7f8497448b40c27bc35f2a3aaaa5647e939 --- /dev/null +++ b/Meissonic/train/dataset_utils.py @@ -0,0 +1,1036 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import torch +from torch.utils.data import Dataset +from torchvision import transforms +from PIL.ImageOps import exif_transpose +from PIL import Image +import io +import pyarrow.parquet as pq +import random +import bisect +import pyarrow.fs as fs +import csv +import numpy as np +import logging + +logger = logging.getLogger(__name__) + +@torch.no_grad() +def tokenize_prompt(tokenizer, prompt, text_encoder_architecture='open_clip'): # support open_clip, CLIP, T5/UMT5 + if text_encoder_architecture == 'CLIP' or text_encoder_architecture == 'open_clip': + return tokenizer( + prompt, + truncation=True, + padding="max_length", + max_length=77, + return_tensors="pt", + ).input_ids + elif text_encoder_architecture in ['umt5-base', 'umt5-xxl', 't5']: + # T5/UMT5 tokenizer + return tokenizer( + prompt, + truncation=True, + padding="max_length", + max_length=512, + return_tensors="pt", + ).input_ids + elif text_encoder_architecture == 'CLIP_T5_base': # we have two tokenizers, 1st for CLIP, 2nd for T5 + input_ids = [] + input_ids.append(tokenizer[0]( + prompt, + truncation=True, + padding="max_length", + max_length=77, + return_tensors="pt", + ).input_ids) + input_ids.append(tokenizer[1]( + prompt, + truncation=True, + padding="max_length", + max_length=512, + return_tensors="pt", + ).input_ids) + return input_ids + else: + raise ValueError(f"Unknown text_encoder_architecture: {text_encoder_architecture}") + +def encode_prompt(text_encoder, input_ids, text_encoder_architecture='open_clip'): # support open_clip, CLIP, T5/UMT5 + if text_encoder_architecture == 'CLIP' or text_encoder_architecture == 'open_clip': + outputs = text_encoder(input_ids=input_ids, return_dict=True, output_hidden_states=True) + encoder_hidden_states = outputs.hidden_states[-2] + cond_embeds = outputs[0] + return encoder_hidden_states, cond_embeds + elif text_encoder_architecture in ['umt5-base', 'umt5-xxl', 't5']: + # T5/UMT5 encoder - only returns encoder_hidden_states, no pooled projection + outputs = text_encoder(input_ids=input_ids, return_dict=True) + encoder_hidden_states = outputs.last_hidden_state + # For T5, we don't have a pooled projection, so return None or a dummy tensor + # The video pipeline doesn't use cond_embeds, so we can return None + cond_embeds = None + return encoder_hidden_states, cond_embeds + elif text_encoder_architecture == 'CLIP_T5_base': + outputs_clip = text_encoder[0](input_ids=input_ids[0], return_dict=True, output_hidden_states=True) + outputs_t5 = text_encoder[1](input_ids=input_ids[1], decoder_input_ids=torch.zeros_like(input_ids[1]), + return_dict=True, output_hidden_states=True) + encoder_hidden_states = outputs_t5.encoder_hidden_states[-2] + cond_embeds = outputs_clip[0] + return encoder_hidden_states, cond_embeds + else: + raise ValueError(f"Unknown text_encoder_architecture: {text_encoder_architecture}") + + +def process_image(image, size, Norm=False, hps_score = 6.0): + image = exif_transpose(image) + + if not image.mode == "RGB": + image = image.convert("RGB") + + orig_height = image.height + orig_width = image.width + + image = transforms.Resize(size, 
interpolation=transforms.InterpolationMode.BILINEAR)(image) + + c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(size, size)) + image = transforms.functional.crop(image, c_top, c_left, size, size) + image = transforms.ToTensor()(image) + + if Norm: + image = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)(image) + + micro_conds = torch.tensor( + [orig_width, orig_height, c_top, c_left, hps_score], + ) + + return {"image": image, "micro_conds": micro_conds} + + +class MyParquetDataset(Dataset): + def __init__(self, root_dir, tokenizer=None, size=512, + text_encoder_architecture='CLIP', norm=False): + random.seed(23) + + self.root_dir = root_dir + self.dataset_receipt = {'MSCOCO_part1': {'total_num': 6212, 'ratio':1}, 'MSCOCO_part2': {'total_num': 6212, 'ratio':1}} + + self.tokenizer = tokenizer + self.size = size + self.text_encoder_architecture = text_encoder_architecture + self.norm = norm + + self.hdfs = fs.HadoopFileSystem(host="", port=0000) # TODO: change to your own HDFS host and port + self._init_mixed_parquet_dir_list() + + self.file_metadata = [] + self.cumulative_sizes = [0] + total = 0 + for path in self.parquet_files: + try: + with pq.ParquetFile(path, filesystem=self.hdfs) as pf: + num_rows = pf.metadata.num_rows + self.file_metadata.append({ + 'path': path, + 'num_rows': num_rows, + 'global_offset': total + }) + total += num_rows + self.cumulative_sizes.append(total) + except Exception as e: + print(f"Error processing {path}: {str(e)}") + continue + + # init cache + self.current_file = None + self.cached_data = None + self.cached_file_index = -1 + + def _init_mixed_parquet_dir_list(self): + print('Loading parquet files, please be patient...') + self.parquet_files = [] + + for key, value in self.dataset_receipt.items(): + # Generate a list of standard Parquet file paths, lazy load + hdfs_path = os.path.join(self.root_dir, key) + + num = value['total_num'] + sampled_list = random.sample( + [f"{hdfs_path}/train-{idx:05d}-of-{num:05d}.parquet" for idx in range(num)], + k=int(num * value['ratio']) + ) + self.parquet_files += sampled_list + + def __len__(self): + return self.cumulative_sizes[-1] + + def _locate_file(self, global_idx): + # Use binary search to quickly locate files + file_index = bisect.bisect_right(self.cumulative_sizes, global_idx) - 1 + if file_index < 0 or file_index >= len(self.file_metadata): + raise IndexError(f"Index {global_idx} out of range") + + file_info = self.file_metadata[file_index] + local_idx = global_idx - file_info['global_offset'] + return file_index, local_idx + + def _load_file(self, file_index): + """Load Parquet files into cache on demand""" + if self.cached_file_index != file_index: + file_info = self.file_metadata[file_index] + try: + table = pq.read_table(file_info['path'], filesystem=self.hdfs) + self.cached_data = table.to_pydict() + self.cached_file_index = file_index + except Exception as e: + print(f"Error loading {file_info['path']}: {str(e)}") + raise + + def __getitem__(self, idx): + file_index, local_idx = self._locate_file(idx) + self._load_file(file_index) + sample = {k: v[local_idx] for k, v in self.cached_data.items()} + + # cprint(sample.keys(), 'red') + generated_caption, image_path = sample['task2'], sample['image'] # only suitable for my data + instance_image = Image.open(io.BytesIO(image_path['bytes'])) + + # if instance_image.width < self.size or instance_image.height < self.size: + # raise ValueError(f"Image at {image_path} is too small") + + rv = 
process_image(instance_image, self.size, self.norm) + + if isinstance(self.tokenizer, list): + _tmp_ = tokenize_prompt(self.tokenizer, generated_caption, self.text_encoder_architecture) + rv["prompt_input_ids"] = [_tmp_[0][0], _tmp_[1][0]] + else: + rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, generated_caption, self.text_encoder_architecture)[ + 0] + + return rv + +class HuggingFaceDataset(Dataset): + def __init__( + self, + hf_dataset, + tokenizer, + image_key, + prompt_key, + prompt_prefix=None, + size=512, + text_encoder_architecture='CLIP', + ): + self.size = size + self.image_key = image_key + self.prompt_key = prompt_key + self.tokenizer = tokenizer + self.hf_dataset = hf_dataset + self.prompt_prefix = prompt_prefix + self.text_encoder_architecture = text_encoder_architecture + + def __len__(self): + return len(self.hf_dataset) + + def __getitem__(self, index): + item = self.hf_dataset[index] + + rv = process_image(item[self.image_key], self.size) + + prompt = item[self.prompt_key] + + if self.prompt_prefix is not None: + prompt = self.prompt_prefix + prompt + + if isinstance(self.tokenizer, list): + _tmp_ = tokenize_prompt(self.tokenizer, prompt, self.text_encoder_architecture) + rv["prompt_input_ids"] = [_tmp_[0][0],_tmp_[1][0]] + else: + rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, prompt, self.text_encoder_architecture)[0] + + return rv + + +def process_video(video_tensor, num_frames, height, width, use_random_crop=True): + """ + Process video tensor for training. + + Uses aspect-ratio preserving resize + crop to avoid distortion. + + Args: + video_tensor: Video tensor of shape [C, F, H, W] or [F, H, W, C] + num_frames: Target number of frames + height: Target height + width: Target width + use_random_crop: If True, use random crop (for training). 
If False, use center crop (for validation/feature extraction) + + Returns: + Processed video tensor of shape [C, F, H, W] in [0, 1] range + """ + # Ensure video is in [C, F, H, W] format + if video_tensor.dim() == 4: + if video_tensor.shape[0] == 3 or video_tensor.shape[0] == 1: + # Already in [C, F, H, W] format + pass + elif video_tensor.shape[-1] == 3 or video_tensor.shape[-1] == 1: + # [F, H, W, C] -> [C, F, H, W] + video_tensor = video_tensor.permute(3, 0, 1, 2) + else: + raise ValueError(f"Unexpected video tensor shape: {video_tensor.shape}") + + # Normalize to [0, 1] if needed + if video_tensor.max() > 1.0: + video_tensor = video_tensor / 255.0 + + C, F, H, W = video_tensor.shape + + # Temporal resampling: ensure exactly num_frames frames + if F != num_frames: + if F < num_frames: + # If video is shorter, pad by repeating the last frame + num_pad = num_frames - F + last_frame = video_tensor[:, -1:, :, :] # [C, 1, H, W] + padding = last_frame.repeat(1, num_pad, 1, 1) # [C, num_pad, H, W] + video_tensor = torch.cat([video_tensor, padding], dim=1) # [C, num_frames, H, W] + F = num_frames + else: + # If video is longer, randomly select a continuous segment of num_frames + max_start = F - num_frames + start_idx = random.randint(0, max_start) + indices = torch.arange(start_idx, start_idx + num_frames) + video_tensor = video_tensor[:, indices, :, :] + F = num_frames # Update F after temporal resampling + + # Spatial resizing: aspect-ratio preserving resize + crop + if H != height or W != width: + # Step 1: Aspect-ratio preserving resize + # Calculate scale factors for both dimensions + scale_h = height / H + scale_w = width / W + + # Use the larger scale to ensure both dimensions are at least as large as target + # This way, after resize, we can crop to exact target size + scale = max(scale_h, scale_w) + + # Calculate new dimensions maintaining aspect ratio + new_H = int(H * scale) + new_W = int(W * scale) + + # Ensure we have at least the target size (handle rounding) + if new_H < height: + new_H = height + if new_W < width: + new_W = width + + # Resize maintaining aspect ratio + # Process each frame: [C, F, H, W] -> reshape to [C*F, 1, H, W] for interpolation + video_tensor = torch.nn.functional.interpolate( + video_tensor.reshape(C * F, 1, H, W), + size=(new_H, new_W), + mode='bilinear', + align_corners=False + ).reshape(C, F, new_H, new_W) + + # Step 2: Crop to target size (height, width) + # Calculate crop coordinates + if use_random_crop: + # Random crop for training (data augmentation) + max_h = new_H - height + max_w = new_W - width + if max_h < 0 or max_w < 0: + # If resized image is smaller than target, pad instead + pad_h = max(0, height - new_H) + pad_w = max(0, width - new_W) + video_tensor = torch.nn.functional.pad( + video_tensor, + (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2), + mode='constant', + value=0 + ) + # If still not exact size, crop or pad + if video_tensor.shape[2] != height or video_tensor.shape[3] != width: + video_tensor = torch.nn.functional.interpolate( + video_tensor.reshape(C * F, 1, video_tensor.shape[2], video_tensor.shape[3]), + size=(height, width), + mode='bilinear', + align_corners=False + ).reshape(C, F, height, width) + else: + crop_h = random.randint(0, max_h) + crop_w = random.randint(0, max_w) + video_tensor = video_tensor[:, :, crop_h:crop_h + height, crop_w:crop_w + width] + else: + # Center crop for validation/feature extraction (deterministic) + crop_h = (new_H - height) // 2 + crop_w = (new_W - width) // 2 + if crop_h < 
0 or crop_w < 0: + # If resized image is smaller than target, pad instead + pad_h = max(0, height - new_H) + pad_w = max(0, width - new_W) + video_tensor = torch.nn.functional.pad( + video_tensor, + (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2), + mode='constant', + value=0 + ) + # If still not exact size, crop or pad + if video_tensor.shape[2] != height or video_tensor.shape[3] != width: + video_tensor = torch.nn.functional.interpolate( + video_tensor.reshape(C * F, 1, video_tensor.shape[2], video_tensor.shape[3]), + size=(height, width), + mode='bilinear', + align_corners=False + ).reshape(C, F, height, width) + else: + video_tensor = video_tensor[:, :, crop_h:crop_h + height, crop_w:crop_w + width] + + # Final verification: ensure output has exactly the expected shape + C, F, H, W = video_tensor.shape + assert F == num_frames, f"Frame count mismatch: expected {num_frames}, got {F}" + assert H == height, f"Height mismatch: expected {height}, got {H}" + assert W == width, f"Width mismatch: expected {width}, got {W}" + + return video_tensor + + +class VideoDataset(Dataset): + """ + Dataset for video training, compatible with HuggingFace datasets format. + Supports OpenVid1M and similar video-text datasets. + """ + def __init__( + self, + hf_dataset, + tokenizer, + video_key="video", + prompt_key="caption", + prompt_prefix=None, + num_frames=16, + height=480, + width=848, + text_encoder_architecture='umt5-base', + use_random_crop=True, # Random crop for training, center crop for validation + ): + self.hf_dataset = hf_dataset + self.tokenizer = tokenizer + self.video_key = video_key + self.prompt_key = prompt_key + self.prompt_prefix = prompt_prefix + self.num_frames = num_frames + self.height = height + self.width = width + self.text_encoder_architecture = text_encoder_architecture + self.use_random_crop = use_random_crop + + def __len__(self): + return len(self.hf_dataset) + + def __getitem__(self, index): + item = self.hf_dataset[index] + + # Load video + video = item[self.video_key] + + # Convert to tensor if needed (handle different formats) + if isinstance(video, list): + # List of PIL Images or tensors + frames = [] + for frame in video: + if isinstance(frame, Image.Image): + frame = transforms.ToTensor()(frame) + frames.append(frame) + video_tensor = torch.stack(frames, dim=1) # [C, F, H, W] + elif isinstance(video, torch.Tensor): + video_tensor = video + else: + raise ValueError(f"Unsupported video type: {type(video)}") + + # Process video + video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) + + # Ensure video tensor has exactly the expected shape + C, F, H, W = video_tensor.shape + if F != self.num_frames or H != self.height or W != self.width: + # If shape doesn't match, create a properly sized tensor + video_tensor = torch.nn.functional.interpolate( + video_tensor.reshape(C * F, 1, H, W), + size=(self.height, self.width), + mode='bilinear', + align_corners=False + ).reshape(C, F, self.height, self.width) + # Ensure exactly num_frames + if F < self.num_frames: + # Pad by repeating last frame + num_pad = self.num_frames - F + last_frame = video_tensor[:, -1:, :, :] + padding = last_frame.repeat(1, num_pad, 1, 1) + video_tensor = torch.cat([video_tensor, padding], dim=1) + elif F > self.num_frames: + # Crop to num_frames + video_tensor = video_tensor[:, :self.num_frames, :, :] + + # Clone to ensure storage is resizable (required for DataLoader collate) + video_tensor = video_tensor.contiguous().clone() + + # Process prompt + 
prompt = item[self.prompt_key] + if self.prompt_prefix is not None: + prompt = self.prompt_prefix + prompt + + prompt_input_ids = tokenize_prompt(self.tokenizer, prompt, self.text_encoder_architecture)[0] + # Clone to ensure storage is resizable + prompt_input_ids = prompt_input_ids.clone() + + rv = { + "video": video_tensor, # [C, num_frames, height, width], guaranteed shape + "prompt_input_ids": prompt_input_ids + } + + return rv + + +class OpenVid1MDataset(Dataset): + """ + Dataset for OpenVid1M video-text pairs from CSV file. + + CSV format: + video,caption,aesthetic score,motion score,temporal consistency score,camera motion,frame,fps,seconds,new_id + + Returns: + dict with keys: + - "video": torch.Tensor of shape [C, F, H, W] in [0, 1] range + - "prompt_input_ids": torch.Tensor of tokenized prompt + """ + def __init__( + self, + csv_path, + video_root_dir, + tokenizer, + num_frames=16, + height=480, + width=848, + text_encoder_architecture='umt5-base', + prompt_prefix=None, + use_random_temporal_crop=True, # If False, always sample from the beginning + use_random_crop=True, # Random crop for training, center crop for validation/feature extraction + ): + """ + Args: + csv_path: Path to the CSV file containing video metadata + video_root_dir: Root directory where video files are stored + tokenizer: Text tokenizer + num_frames: Target number of frames to extract + height: Target height + width: Target width + text_encoder_architecture: Architecture of text encoder + prompt_prefix: Optional prefix to add to prompts + """ + self.csv_path = csv_path + self.video_root_dir = video_root_dir + self.tokenizer = tokenizer + self.num_frames = num_frames + self.height = height + self.width = width + self.text_encoder_architecture = text_encoder_architecture + self.prompt_prefix = prompt_prefix + self.use_random_temporal_crop = use_random_temporal_crop + self.use_random_crop = use_random_crop + + # Load CSV data + self.data = [] + with open(csv_path, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + self.data.append(row) + + logger.info(f"Loaded {len(self.data)} video entries from {csv_path}") + + # Try to import video loading library + self.video_loader = None + try: + import decord + decord.bridge.set_bridge('torch') + self.video_loader = 'decord' + logger.info("Using decord for video loading") + except ImportError: + try: + import av + self.video_loader = 'av' + logger.info("Using PyAV for video loading") + except ImportError: + try: + import cv2 + self.video_loader = 'cv2' + logger.info("Using OpenCV for video loading") + except ImportError: + raise ImportError( + "No video loading library found. 
Please install one of: " + "decord (pip install decord), PyAV (pip install av), or opencv-python (pip install opencv-python)" + ) + + def __len__(self): + return len(self.data) + + def _load_video_decord(self, video_path): + """Load video using decord""" + import decord + vr = decord.VideoReader(video_path, ctx=decord.cpu(0)) + total_frames = len(vr) + + # Sample frames: random temporal crop (continuous segment) for better temporal coherence + if total_frames <= self.num_frames: + indices = list(range(total_frames)) + else: + if self.use_random_temporal_crop: + # Randomly select a continuous segment of num_frames + max_start = total_frames - self.num_frames + start_idx = random.randint(0, max_start) + else: + # Fixed sampling: always start from the beginning + start_idx = 0 + indices = list(range(start_idx, start_idx + self.num_frames)) + + frames = vr.get_batch(indices) # [F, H, W, C] in uint8 + # If using torch bridge, frames is already a torch Tensor + if isinstance(frames, torch.Tensor): + frames = frames.float() # [F, H, W, C] + else: + # Use torch.tensor() instead of torch.from_numpy() to ensure a complete copy + # This avoids "Trying to resize storage that is not resizable" errors in DataLoader collate + frames = torch.tensor(frames, dtype=torch.float32) # [F, H, W, C], fully copied + frames = frames.permute(3, 0, 1, 2) # [C, F, H, W] + frames = frames / 255.0 # Normalize to [0, 1] + + return frames + + def _load_video_av(self, video_path): + """Load video using PyAV""" + import av + container = av.open(video_path) + frames = [] + + # Get video stream + video_stream = container.streams.video[0] + total_frames = video_stream.frames if video_stream.frames > 0 else None + + # Sample frames: random temporal crop (continuous segment) for better temporal coherence + if total_frames is None: + # If we can't get frame count, decode all frames and sample + frame_list = [] + for frame in container.decode(video_stream): + frame_list.append(frame) + total_frames = len(frame_list) + if total_frames <= self.num_frames: + frame_indices = list(range(total_frames)) + else: + if self.use_random_temporal_crop: + # Randomly select a continuous segment of num_frames + max_start = total_frames - self.num_frames + start_idx = random.randint(0, max_start) + else: + # Fixed sampling: always start from the beginning + start_idx = 0 + frame_indices = list(range(start_idx, start_idx + self.num_frames)) + frames = [transforms.ToTensor()(frame_list[i].to_image()) for i in frame_indices] + else: + if total_frames <= self.num_frames: + frame_indices = list(range(total_frames)) + else: + if self.use_random_temporal_crop: + # Randomly select a continuous segment of num_frames + max_start = total_frames - self.num_frames + start_idx = random.randint(0, max_start) + else: + # Fixed sampling: always start from the beginning + start_idx = 0 + frame_indices = list(range(start_idx, start_idx + self.num_frames)) + + frame_idx = 0 + for frame in container.decode(video_stream): + if frame_idx in frame_indices: + img = frame.to_image() # PIL Image + img_tensor = transforms.ToTensor()(img) # [C, H, W] + frames.append(img_tensor) + if len(frames) >= self.num_frames: + break + frame_idx += 1 + + container.close() + + if len(frames) == 0: + raise ValueError(f"No frames extracted from {video_path}") + + # Stack frames: [C, F, H, W] + video_tensor = torch.stack(frames, dim=1) + + # Pad if needed + if video_tensor.shape[1] < self.num_frames: + padding = torch.zeros( + video_tensor.shape[0], + self.num_frames - video_tensor.shape[1], 
+ video_tensor.shape[2], + video_tensor.shape[3] + ) + video_tensor = torch.cat([video_tensor, padding], dim=1) + + return video_tensor + + def _load_video_cv2(self, video_path): + """Load video using OpenCV""" + import cv2 + cap = cv2.VideoCapture(video_path) + frames = [] + + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + # Sample frames: random temporal crop (continuous segment) for better temporal coherence + if total_frames <= self.num_frames: + frame_indices = list(range(total_frames)) + else: + if self.use_random_temporal_crop: + # Randomly select a continuous segment of num_frames + max_start = total_frames - self.num_frames + start_idx = random.randint(0, max_start) + else: + # Fixed sampling: always start from the beginning + start_idx = 0 + frame_indices = list(range(start_idx, start_idx + self.num_frames)) + + frame_idx = 0 + while True: + ret, frame = cap.read() + if not ret: + break + if frame_idx in frame_indices: + # Convert BGR to RGB + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + # Convert to tensor [C, H, W] and normalize to [0, 1] + # Use torch.tensor() instead of torch.from_numpy() to ensure a complete copy + # This avoids "Trying to resize storage that is not resizable" errors in DataLoader collate + frame_tensor = torch.tensor(frame_rgb, dtype=torch.float32).permute(2, 0, 1) / 255.0 + frames.append(frame_tensor) + if len(frames) >= self.num_frames: + break + frame_idx += 1 + + cap.release() + + if len(frames) == 0: + raise ValueError(f"No frames extracted from {video_path}") + + # Stack frames: [C, F, H, W] + video_tensor = torch.stack(frames, dim=1) + + # Pad if needed + if video_tensor.shape[1] < self.num_frames: + padding = torch.zeros( + video_tensor.shape[0], + self.num_frames - video_tensor.shape[1], + video_tensor.shape[2], + video_tensor.shape[3] + ) + video_tensor = torch.cat([video_tensor, padding], dim=1) + + return video_tensor + + def _load_video(self, video_path): + """Load video from path using the available video loader""" + full_path = os.path.join(self.video_root_dir, video_path) + + if not os.path.exists(full_path): + raise FileNotFoundError(f"Video file not found: {full_path}") + + if self.video_loader == 'decord': + return self._load_video_decord(full_path) + elif self.video_loader == 'av': + return self._load_video_av(full_path) + elif self.video_loader == 'cv2': + return self._load_video_cv2(full_path) + else: + raise ValueError(f"Unknown video loader: {self.video_loader}") + + def __getitem__(self, index): + row = self.data[index] + + # Load video + video_path = row['video'] + try: + video_tensor = self._load_video(video_path) + except Exception as e: + # If video loading fails, return a zero tensor and log error + logger.warning(f"Failed to load video {video_path}: {e}") + video_tensor = torch.zeros(3, self.num_frames, self.height, self.width) + + # Process video: aspect-ratio preserving resize + crop to target dimensions + video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width, use_random_crop=self.use_random_crop) + + # Ensure video tensor has exactly the expected shape + C, F, H, W = video_tensor.shape + if F != self.num_frames or H != self.height or W != self.width: + # If shape doesn't match, create a properly sized tensor + video_tensor = torch.nn.functional.interpolate( + video_tensor.reshape(C * F, 1, H, W), + size=(self.height, self.width), + mode='bilinear', + align_corners=False + ).reshape(C, F, self.height, self.width) + # Ensure exactly num_frames + if F < self.num_frames: + # Pad 
by repeating last frame + num_pad = self.num_frames - F + last_frame = video_tensor[:, -1:, :, :] + padding = last_frame.repeat(1, num_pad, 1, 1) + video_tensor = torch.cat([video_tensor, padding], dim=1) + elif F > self.num_frames: + # Crop to num_frames + video_tensor = video_tensor[:, :self.num_frames, :, :] + + # Clone to ensure storage is resizable (required for DataLoader collate) + video_tensor = video_tensor.contiguous().clone() + + # Process prompt + prompt = row['caption'] + if self.prompt_prefix is not None: + prompt = self.prompt_prefix + prompt + + prompt_input_ids = tokenize_prompt(self.tokenizer, prompt, self.text_encoder_architecture)[0] + # Clone to ensure storage is resizable + prompt_input_ids = prompt_input_ids.clone() + + return { + "video": video_tensor, # [C, num_frames, height, width], guaranteed shape + "prompt_input_ids": prompt_input_ids + } + + +class TinyOpenVid1MDataset(OpenVid1MDataset): + """ + A tiny subset of OpenVid1MDataset for overfitting experiments. + Only takes the first N samples from the full dataset. + """ + def __init__( + self, + csv_path, + video_root_dir=None, + tokenizer=None, + num_frames=16, + height=480, + width=848, + text_encoder_architecture='umt5-base', + prompt_prefix=None, + max_samples=256, # Only use first N samples + seed=42, # Fixed seed for reproducibility + ): + """ + Args: + max_samples: Maximum number of samples to use (default: 256) + seed: Random seed for reproducibility (default: 42) + """ + # Initialize parent class + super().__init__( + csv_path=csv_path, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=num_frames, + height=height, + width=width, + text_encoder_architecture=text_encoder_architecture, + prompt_prefix=prompt_prefix, + ) + + # Limit to first max_samples + original_len = len(self.data) + if original_len > max_samples: + # Use fixed seed to ensure reproducibility + import random + random.seed(seed) + # Shuffle with fixed seed, then take first max_samples + indices = list(range(original_len)) + random.shuffle(indices) + self.data = [self.data[i] for i in indices[:max_samples]] + logger.info(f"Limited dataset to {max_samples} samples (from {original_len} total) for overfitting experiment") + else: + logger.info(f"Using all {len(self.data)} samples (less than max_samples={max_samples})") + + +def get_hierarchical_path(base_dir, index): + """ + Get hierarchical path for loading features from 3-level directory structure. + + Structure: base_dir/level1/level2/level3/filename.npy + - level1: index // 1000000 (0-999) + - level2: (index // 1000) % 1000 (0-999) + - level3: index % 1000 (0-999) + + Args: + base_dir: Base directory for features + index: Sample index + + Returns: + Full path to the file + """ + level1 = index // 1000000 + level2 = (index // 1000) % 1000 + level3 = index % 1000 + + file_path = os.path.join( + base_dir, + f"{level1:03d}", + f"{level2:03d}", + f"{level3:03d}", + f"{index:08d}.npy" + ) + + return file_path + + +class PrecomputedFeatureDataset(Dataset): + """ + Dataset for loading pre-extracted video codes and text embeddings. + + This dataset loads features that were pre-extracted by extract_features.py, + avoiding the need to encode videos and text during training. 
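+
+    Example usage (an illustrative sketch; the features directory and the
+    DataLoader settings below are placeholders, not part of this repository):
+
+        from torch.utils.data import DataLoader
+        ds = PrecomputedFeatureDataset("/path/to/extracted_features")
+        loader = DataLoader(ds, batch_size=8, shuffle=True, num_workers=4)
+        batch = next(iter(loader))
+        # batch["video_codes"]    -> [8, F', H', W'] int32 token indices
+        # batch["text_embedding"] -> [8, L, D] float16 encoder hidden states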
+ + Features are stored in a 3-level hierarchical directory structure: + - video_codes/level1/level2/level3/index.npy + - text_embeddings/level1/level2/level3/index.npy + """ + + def __init__( + self, + features_dir, + num_samples=None, + start_index=0, + ): + """ + Args: + features_dir: Directory containing extracted features (should have video_codes/ and text_embeddings/ subdirs) + num_samples: Number of samples to use. If None, use all available samples. + start_index: Starting index for samples (for resuming or subset selection) + """ + self.features_dir = features_dir + self.video_codes_dir = os.path.join(features_dir, "video_codes") + self.text_embeddings_dir = os.path.join(features_dir, "text_embeddings") + self.metadata_file = os.path.join(features_dir, "metadata.json") + + # Load metadata + if os.path.exists(self.metadata_file): + import json + with open(self.metadata_file, 'r') as f: + self.metadata = json.load(f) + logger.info(f"Loaded metadata from {self.metadata_file}") + logger.info(f" Total samples in metadata: {self.metadata.get('num_samples', 'unknown')}") + + # Get available indices from metadata + if 'samples' in self.metadata and len(self.metadata['samples']) > 0: + available_indices = sorted([s['index'] for s in self.metadata['samples']]) + else: + # Fallback: infer from directory structure + available_indices = self._scan_hierarchical_directory(self.video_codes_dir) + else: + # If no metadata, scan directory structure + logger.warning(f"Metadata file not found: {self.metadata_file}, scanning directory structure") + self.metadata = {} + available_indices = self._scan_hierarchical_directory(self.video_codes_dir) + + # Filter by start_index and num_samples + available_indices = [idx for idx in available_indices if idx >= start_index] + if num_samples is not None: + available_indices = available_indices[:num_samples] + + self.indices = available_indices + logger.info(f"PrecomputedFeatureDataset: {len(self.indices)} samples available") + if len(self.indices) > 0: + logger.info(f" Index range: {min(self.indices)} to {max(self.indices)}") + + def _scan_hierarchical_directory(self, base_dir): + """ + Scan hierarchical directory structure to find all available indices. 
+ + Args: + base_dir: Base directory to scan + + Returns: + List of available indices + """ + available_indices = [] + + if not os.path.exists(base_dir): + raise FileNotFoundError(f"Directory not found: {base_dir}") + + # Scan level1 directories (000-999) + for level1 in range(1000): + level1_dir = os.path.join(base_dir, f"{level1:03d}") + if not os.path.exists(level1_dir): + continue + + # Scan level2 directories (000-999) + for level2 in range(1000): + level2_dir = os.path.join(level1_dir, f"{level2:03d}") + if not os.path.exists(level2_dir): + continue + + # Scan level3 directories (000-999) + for level3 in range(1000): + level3_dir = os.path.join(level2_dir, f"{level3:03d}") + if not os.path.exists(level3_dir): + continue + + # List all .npy files in level3 directory + for filename in os.listdir(level3_dir): + if filename.endswith('.npy'): + try: + index = int(filename.replace('.npy', '')) + available_indices.append(index) + except ValueError: + continue + + return sorted(available_indices) + + def __len__(self): + return len(self.indices) + + def __getitem__(self, idx): + sample_idx = self.indices[idx] + + # Get hierarchical paths + video_code_path = get_hierarchical_path(self.video_codes_dir, sample_idx) + text_embedding_path = get_hierarchical_path(self.text_embeddings_dir, sample_idx) + + # Load video codes + # Note: We load directly (not mmap) to avoid storage sharing issues with torch + # The files are small enough (video codes are int32, typically < 1MB per sample) + if not os.path.exists(video_code_path): + raise FileNotFoundError(f"Video code not found: {video_code_path}") + video_codes_np = np.load(video_code_path) # [F', H', W'] + # Use torch.tensor() instead of torch.from_numpy() to ensure a complete copy + # This avoids "Trying to resize storage that is not resizable" errors in DataLoader collate + video_codes = torch.tensor(video_codes_np, dtype=torch.int32) # CPU tensor, int32, fully copied + del video_codes_np # Release numpy array reference + + # Load text embedding + # Note: We load directly (not mmap) to avoid storage sharing issues with torch + if not os.path.exists(text_embedding_path): + raise FileNotFoundError(f"Text embedding not found: {text_embedding_path}") + text_embedding_np = np.load(text_embedding_path) # [L, D] + # Use torch.tensor() instead of torch.from_numpy() to ensure a complete copy + # Preserve original dtype (should be float16 from extraction) + text_embedding_dtype = torch.float16 if text_embedding_np.dtype == np.float16 else torch.float32 + text_embedding = torch.tensor(text_embedding_np, dtype=text_embedding_dtype) # CPU tensor, fully copied + del text_embedding_np # Release numpy array reference + + return { + "video_codes": video_codes, # [F', H', W'], CPU tensor, int32 + "text_embedding": text_embedding, # [L, D], CPU tensor, float16/bfloat16 + "sample_index": sample_idx, + } \ No newline at end of file diff --git a/Meissonic/train/extract.sh b/Meissonic/train/extract.sh new file mode 100644 index 0000000000000000000000000000000000000000..6d6a95d40bcdbe4ea3449ebede80cbd105528f77 --- /dev/null +++ b/Meissonic/train/extract.sh @@ -0,0 +1,45 @@ + +accelerate launch --multi_gpu --gpu_ids '0,1,2,3,4,5,6,7' --main_process_port 25011 --num_processes 8 \ + train/extract_features.py \ + --csv_path /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv \ + --output_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128 \ + --text_encoder_architecture umt5-xxl \ + --video_tokenizer_model_id Cosmos-0.1-Tokenizer-DV4x8x8 \ + 
    --num_frames 17 \
+    --video_height 128 \
+    --video_width 128 \
+    --batch_size 64 \
+    --num_workers 8 \
+    --extract_text
+    # --extract_video
+
+
+# python train/extract_empty_embeds.py \
+#     --text_encoder_architecture umt5-base \
+#     --output_path /path/to/empty_embeds.pt \
+#     --dtype float16
+
+
+# python train/train_mei_video.py \
+#     --use_precomputed_features \
+#     --features_dir /path/to/extracted_features \
+#     --text_encoder_architecture umt5-base \
+#     --video_tokenizer_model_id Cosmos-1.0-Tokenizer-DV8x16x16 \
+#     --num_frames 16 \
+#     --video_height 480 \
+#     --video_width 848 \
+#     --train_batch_size 8 \
+#     --learning_rate 3e-4 \
+#     --max_train_steps 10000 \
+#     --output_dir ./output \
+#     --mixed_precision bf16
+
+
+# python train/check_codebook_range.py \
+#     --csv_path /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv \
+#     --video_tokenizer_model_id Cosmos-0.1-Tokenizer-DV4x8x8 \
+#     --num_frames 16 \
+#     --video_height 480 \
+#     --video_width 848 \
+#     --check_interval 10 \
+#     --max_samples 1000  # Optional: limit the number of samples to check
\ No newline at end of file
diff --git a/Meissonic/train/extract_empty_embeds.py b/Meissonic/train/extract_empty_embeds.py
new file mode 100644
index 0000000000000000000000000000000000000000..b28c7daec8f506d57ec0af32c64cf3d75dce137b
--- /dev/null
+++ b/Meissonic/train/extract_empty_embeds.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""
+Extract and save empty_embeds for conditional dropout.
+
+This script extracts the empty embedding (from an empty string prompt)
+and saves it to a file that can be loaded during training with precomputed features.
+"""
+
+import argparse
+import os
+import json
+import torch
+from transformers import T5EncoderModel, T5Tokenizer
+from dataset_utils import tokenize_prompt, encode_prompt
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Extract empty embeddings for conditional dropout")
+
+    parser.add_argument(
+        "--text_encoder_architecture",
+        type=str,
+        default="umt5-base",
+        choices=["umt5-base", "umt5-xxl", "t5"],
+        help="Text encoder architecture",
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        required=True,
+        help="Path to save the empty_embeds (will save as .pt file and metadata as .json)",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device to use for encoding",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="float16",
+        choices=["float16", "bfloat16", "float32"],
+        help="Data type for saving embeddings",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    # Map architecture to model ID
+    if args.text_encoder_architecture == "umt5-base":
+        model_id = "google/umt5-base"
+    elif args.text_encoder_architecture == "umt5-xxl":
+        model_id = "google/umt5-xxl"
+    elif args.text_encoder_architecture == "t5":
+        model_id = "t5-base"
+    else:
+        raise ValueError(f"Unknown text encoder architecture: {args.text_encoder_architecture}")
+
+    # Map dtype
+    dtype_map = {
+        "float16": torch.float16,
+        "bfloat16": torch.bfloat16,
+        "float32": torch.float32,
+    }
+    dtype = dtype_map[args.dtype]
+
+    logger.info(f"Loading text encoder: {model_id}")
+    logger.info(f"Device: {args.device}, Dtype: {args.dtype}")
+
+    # Load text encoder and tokenizer
+    text_encoder = T5EncoderModel.from_pretrained(model_id)
+    tokenizer = T5Tokenizer.from_pretrained(model_id)
+
+    # Move to device and set dtype
+
text_encoder.to(device=args.device, dtype=dtype) + text_encoder.eval() + text_encoder.requires_grad_(False) + + # Extract empty embedding + logger.info("Extracting empty embedding from empty string...") + with torch.no_grad(): + empty_input_ids = tokenize_prompt(tokenizer, "", args.text_encoder_architecture) + empty_input_ids = empty_input_ids.to(args.device) + + empty_embeds, cond_embeds = encode_prompt( + text_encoder, + empty_input_ids, + args.text_encoder_architecture + ) + + # Convert to CPU and target dtype + empty_embeds = empty_embeds.cpu().to(dtype) + + logger.info(f"Empty embedding shape: {empty_embeds.shape}") + logger.info(f"Empty embedding dtype: {empty_embeds.dtype}") + + # Save empty_embeds + output_dir = os.path.dirname(args.output_path) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + # Save as .pt file + torch.save(empty_embeds, args.output_path) + logger.info(f"Saved empty_embeds to: {args.output_path}") + + # Save metadata + metadata_path = args.output_path.replace('.pt', '.json') + metadata = { + "text_encoder_architecture": args.text_encoder_architecture, + "model_id": model_id, + "empty_embeds_shape": list(empty_embeds.shape), + "empty_embeds_dtype": str(empty_embeds.dtype), + "device": args.device, + "dtype": args.dtype, + } + + with open(metadata_path, 'w') as f: + json.dump(metadata, f, indent=2) + logger.info(f"Saved metadata to: {metadata_path}") + + logger.info("Done!") + + +if __name__ == "__main__": + main() + diff --git a/Meissonic/train/extract_features.py b/Meissonic/train/extract_features.py new file mode 100644 index 0000000000000000000000000000000000000000..907d17b3c456d7ba40f37763e90eca594ce31829 --- /dev/null +++ b/Meissonic/train/extract_features.py @@ -0,0 +1,580 @@ +#!/usr/bin/env python3 +""" +Extract video codes and text embeddings from video-text pairs for efficient training. + +This script pre-extracts: +1. Video codes: Discrete tokens from CosmosVideoTokenizer +2. Text embeddings: Encoder hidden states from T5/UMT5 + +The extracted features are saved to disk and can be loaded directly during training, +avoiding repeated encoding operations. +""" + +import argparse +import os +import sys +import logging +from pathlib import Path +from tqdm import tqdm +import torch +import numpy as np +from torch.utils.data import DataLoader, DistributedSampler +import json + +sys.path.append(os.getcwd()) + +from train.dataset_utils import OpenVid1MDataset, tokenize_prompt, encode_prompt +from src.pipeline_video import CosmosVideoTokenizer +from transformers import T5Tokenizer, T5EncoderModel +from accelerate import Accelerator + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def get_hierarchical_path(base_dir, index): + """ + Get hierarchical path for storing features in 3-level directory structure. 
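+
+    A short worked example of the mapping (the index is made up; shown
+    doctest-style with POSIX path separators, for illustration only):
+
+        >>> get_hierarchical_path("video_codes", 1234567)
+        ('video_codes/001/234/567', 'video_codes/001/234/567/01234567.npy')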
+ + Structure: base_dir/level1/level2/level3/filename.npy + - level1: index // 1000000 (0-999) + - level2: (index // 1000) % 1000 (0-999) + - level3: index % 1000 (0-999) + + Args: + base_dir: Base directory for features + index: Sample index + + Returns: + Full path to the file + """ + level1 = index // 1000000 + level2 = (index // 1000) % 1000 + level3 = index % 1000 + + dir_path = os.path.join( + base_dir, + f"{level1:03d}", + f"{level2:03d}", + f"{level3:03d}" + ) + file_path = os.path.join(dir_path, f"{index:08d}.npy") + + return dir_path, file_path + + +def parse_args(): + parser = argparse.ArgumentParser(description="Extract video codes and text embeddings") + + parser.add_argument( + "--csv_path", + type=str, + required=True, + help="Path to OpenVid1M CSV file", + ) + parser.add_argument( + "--video_root_dir", + type=str, + default=None, + help="Root directory containing video files. If None, will auto-detect.", + ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Output directory to save extracted features", + ) + parser.add_argument( + "--text_encoder_architecture", + type=str, + default="umt5-base", + choices=["umt5-base", "umt5-xxl", "t5"], + help="Text encoder architecture", + ) + parser.add_argument( + "--video_tokenizer_model_id", + type=str, + default="Cosmos-1.0-Tokenizer-DV8x16x16", + help="HuggingFace model ID for Cosmos video tokenizer", + ) + parser.add_argument( + "--num_frames", + type=int, + default=16, + help="Number of frames per video", + ) + parser.add_argument( + "--video_height", + type=int, + default=480, + help="Video height", + ) + parser.add_argument( + "--video_width", + type=int, + default=848, + help="Video width", + ) + parser.add_argument( + "--batch_size", + type=int, + default=4, + help="Batch size for feature extraction", + ) + parser.add_argument( + "--num_workers", + type=int, + default=4, + help="Number of dataloader workers", + ) + parser.add_argument( + "--max_samples", + type=int, + default=None, + help="Maximum number of samples to process (for testing). If None, process all.", + ) + parser.add_argument( + "--resume_from_index", + type=int, + default=0, + help="Resume extraction from this index (for resuming interrupted extraction)", + ) + parser.add_argument( + "--prompt_prefix", + type=str, + default=None, + help="Prefix to add to prompts", + ) + parser.add_argument( + "--extract_video", + action="store_true", + default=False, + help="Extract video codes. Enable this flag to dump video codes.", + ) + parser.add_argument( + "--extract_text", + action="store_true", + default=False, + help="Extract text embeddings. Enable this flag to dump text embeddings.", + ) + + return parser.parse_args() + + +def main(): + args = parse_args() + + # Initialize accelerator + accelerator = Accelerator() + + # Log GPU info + logger.info(f"Process {accelerator.process_index}/{accelerator.num_processes} on device {accelerator.device}") + + if accelerator.is_main_process: + os.makedirs(args.output_dir, exist_ok=True) + logger.info(f"Output directory: {args.output_dir}") + logger.info(f"Using {accelerator.num_processes} GPUs for parallel extraction") + logger.info(f"Extract video codes: {args.extract_video}") + logger.info(f"Extract text embeddings: {args.extract_text}") + + # Validate that at least one feature type is selected + if not args.extract_video and not args.extract_text: + raise ValueError("At least one feature type must be enabled. 
Use --extract_video and/or --extract_text.") + + device = accelerator.device + dtype = torch.float32 + + # Initialize text encoder (only if needed) + text_encoder = None + tokenizer = None + if args.extract_text: + logger.info(f"Loading text encoder: {args.text_encoder_architecture}") + if args.text_encoder_architecture == "umt5-base": + model_id = "google/umt5-base" + elif args.text_encoder_architecture == "umt5-xxl": + model_id = "google/umt5-xxl" + elif args.text_encoder_architecture == "t5": + model_id = "t5-base" + else: + raise ValueError(f"Unknown text encoder: {args.text_encoder_architecture}") + + text_encoder = T5EncoderModel.from_pretrained(model_id) + tokenizer = T5Tokenizer.from_pretrained(model_id) + text_encoder.to(device=device, dtype=dtype) + text_encoder.eval() + text_encoder.requires_grad_(False) + + # Extract empty_embeds for conditional dropout (only on main process) + if accelerator.is_main_process: + logger.info("Extracting empty_embeds for conditional dropout...") + with torch.no_grad(): + empty_input_ids = tokenize_prompt(tokenizer, "", args.text_encoder_architecture) + empty_input_ids = empty_input_ids.to(device) + empty_embeds, _ = encode_prompt( + text_encoder, + empty_input_ids, + args.text_encoder_architecture + ) + + # Convert to CPU and save as .npy (more space-efficient than .pt) + empty_embeds_np = empty_embeds.cpu().numpy().astype(np.float16) # Save as float16 to save space + empty_embeds_path = os.path.join(args.output_dir, "empty_embeds.npy") + np.save(empty_embeds_path, empty_embeds_np) + logger.info(f"Saved empty_embeds to: {empty_embeds_path}") + logger.info(f" Shape: {empty_embeds_np.shape}, dtype: {empty_embeds_np.dtype}") + else: + logger.info("Skipping text encoder loading (--no_extract_text)") + # Still need tokenizer for dataset if video extraction needs prompts + # But if only extracting video, we might not need text processing + # For now, we'll still create a minimal tokenizer for dataset compatibility + if args.extract_video: + # Use a simple tokenizer just for dataset compatibility + tokenizer = T5Tokenizer.from_pretrained("google/umt5-base") + + # Initialize video tokenizer (only if needed) + video_tokenizer = None + if args.extract_video: + logger.info(f"Loading video tokenizer: {args.video_tokenizer_model_id}") + video_tokenizer = CosmosVideoTokenizer( + model_id=args.video_tokenizer_model_id, + device=device, + dtype=dtype + ) + video_tokenizer.requires_grad_(False) + video_tokenizer.eval() + else: + logger.info("Skipping video tokenizer loading (--no_extract_video)") + + # Auto-detect video_root_dir if not provided + if args.video_root_dir is None: + csv_dir = os.path.dirname(args.csv_path) + if os.path.exists(os.path.join(csv_dir, 'video_reorg')): + video_root_dir = os.path.join(csv_dir, 'video_reorg') + elif os.path.exists(os.path.join(os.path.dirname(csv_dir), 'video_reorg')): + video_root_dir = os.path.join(os.path.dirname(csv_dir), 'video_reorg') + else: + video_root_dir = csv_dir + logger.warning(f"Video directory not found, using CSV directory: {video_root_dir}") + else: + video_root_dir = args.video_root_dir + + # Create dataset + # For feature extraction, use fixed temporal crop (from beginning) and center crop for consistency + # Training will use random temporal crop and random crop for data augmentation + dataset = OpenVid1MDataset( + csv_path=args.csv_path, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + 
text_encoder_architecture=args.text_encoder_architecture, + prompt_prefix=args.prompt_prefix, + use_random_temporal_crop=False, # Fixed sampling for feature extraction consistency + use_random_crop=False, # Center crop for feature extraction consistency + ) + + # Limit dataset size if specified + if args.max_samples is not None: + dataset.data = dataset.data[:args.max_samples] + logger.info(f"Limited dataset to {len(dataset)} samples") + + # Calculate dataset size and per-process info + num_processes = accelerator.num_processes + process_index = accelerator.process_index + + # Resume from index + if args.resume_from_index > 0: + dataset.data = dataset.data[args.resume_from_index:] + logger.info(f"Resuming from index {args.resume_from_index}, remaining samples: {len(dataset)}") + + # Create DistributedSampler for proper data sharding across GPUs + sampler = DistributedSampler( + dataset, + num_replicas=num_processes, + rank=process_index, + shuffle=False, # Don't shuffle for feature extraction + drop_last=False, + ) + + # Get sampler indices before prepare (they won't be accessible after) + sampler_indices = list(sampler) # This gives us the global dataset indices for this process + + # Create dataloader + dataloader = DataLoader( + dataset, + batch_size=args.batch_size, + sampler=sampler, # Use DistributedSampler + num_workers=args.num_workers, + pin_memory=True, + ) + + dataloader = accelerator.prepare(dataloader) + + # Create output directories (only for features that will be extracted) + video_codes_dir = None + text_embeddings_dir = None + if args.extract_video: + video_codes_dir = os.path.join(args.output_dir, "video_codes") + os.makedirs(video_codes_dir, exist_ok=True) + if args.extract_text: + text_embeddings_dir = os.path.join(args.output_dir, "text_embeddings") + os.makedirs(text_embeddings_dir, exist_ok=True) + + metadata_file = os.path.join(args.output_dir, "metadata.json") + + # Calculate dataset size and per-process info + total_samples = len(dataset) + + # Extract features + logger.info(f"[GPU {process_index}] Starting feature extraction for {total_samples} samples (process {process_index+1}/{num_processes})...") + logger.info(f"[GPU {process_index}] This process will handle ~{len(dataloader) * args.batch_size} samples") + + # Get codebook_size and mask_token_id from video_tokenizer (if extracting video) + codebook_size = None + mask_token_id = None + if args.extract_video and video_tokenizer is not None: + codebook_size = video_tokenizer.codebook_size + mask_token_id = video_tokenizer.mask_token_id + logger.info(f"[GPU {process_index}] Video tokenizer info: codebook_size={codebook_size}, mask_token_id={mask_token_id}") + + # Get empty_embeds info (only on main process, will be added to metadata later) + empty_embeds_shape = None + empty_embeds_path = os.path.join(args.output_dir, "empty_embeds.npy") + if args.extract_text and accelerator.is_main_process and os.path.exists(empty_embeds_path): + empty_embeds_np = np.load(empty_embeds_path) + empty_embeds_shape = list(empty_embeds_np.shape) + logger.info(f"Empty embeds shape: {empty_embeds_shape}") + + # Per-process metadata (will be merged at the end) + process_metadata = { + "process_index": process_index, + "num_samples": total_samples, + "extract_video": args.extract_video, + "extract_text": args.extract_text, + "text_encoder_architecture": args.text_encoder_architecture if args.extract_text else None, + "video_tokenizer_model_id": args.video_tokenizer_model_id if args.extract_video else None, + "codebook_size": 
codebook_size, + "mask_token_id": mask_token_id, + "num_frames": args.num_frames, + "video_height": args.video_height, + "video_width": args.video_width, + "prompt_prefix": args.prompt_prefix, + "empty_embeds_shape": empty_embeds_shape if process_index == 0 else None, # Only main process has this + "empty_embeds_path": "empty_embeds.npy" if args.extract_text else None, + "samples": [] + } + + # Track processed samples for this process + process_failed_samples = [] + process_samples_processed = 0 + + with torch.no_grad(): + for batch_idx, batch in enumerate(tqdm(dataloader, desc=f"[GPU {process_index}] Extracting", disable=not accelerator.is_main_process)): + batch_size = batch["video"].shape[0] if args.extract_video else batch["prompt_input_ids"].shape[0] + + # Extract video codes (if needed) + video_codes = None + if args.extract_video: + videos = batch["video"].to(device, non_blocking=True) # [B, C, F, H, W] + try: + video_codes = video_tokenizer.encode(videos) # [B, F', H', W'] + video_codes = video_codes.cpu().numpy() # Convert to numpy for saving + except Exception as e: + logger.error(f"[GPU {process_index}] Failed to encode video batch {batch_idx}: {e}") + continue + + # Extract text embeddings (if needed) + encoder_hidden_states = None + if args.extract_text: + prompt_input_ids = batch["prompt_input_ids"].to(device, non_blocking=True) + try: + encoder_hidden_states, _ = encode_prompt( + text_encoder, + prompt_input_ids, + args.text_encoder_architecture + ) # [B, L, D] + encoder_hidden_states = encoder_hidden_states.cpu().numpy() # Convert to numpy for saving + except Exception as e: + logger.error(f"[GPU {process_index}] Failed to encode text batch {batch_idx}: {e}") + continue + + # Get the actual dataset indices for this batch + # sampler_indices contains the global dataset indices assigned to this process + local_start_idx = batch_idx * args.batch_size + + # Save features for each sample in the batch + for i in range(batch_size): + local_idx = local_start_idx + i + + if local_idx < len(sampler_indices): + # Get the global dataset index from the sampler + global_dataset_idx = sampler_indices[local_idx] + + # Calculate the final global sample index (accounting for resume_from_index) + sample_idx = args.resume_from_index + global_dataset_idx + + # Get original video path and prompt from dataset for metadata + row = None + if global_dataset_idx < len(dataset.data): + row = dataset.data[global_dataset_idx] + + # Save video codes (if extracted) + video_code = None + if args.extract_video and video_codes is not None: + video_code_dir, video_code_path = get_hierarchical_path(video_codes_dir, sample_idx) + os.makedirs(video_code_dir, exist_ok=True) + video_code = video_codes[i] # [F', H', W'] + # Ensure saved as CPU numpy array with int32 (smaller than int64) + if isinstance(video_code, torch.Tensor): + video_code = video_code.cpu().numpy() + # Convert to int32 to save space (codebook_size is typically < 2^31) + video_code = video_code.astype(np.int32) + np.save(video_code_path, video_code) + + # Save text embedding (if extracted) + text_embedding = None + if args.extract_text and encoder_hidden_states is not None: + text_embedding_dir, text_embedding_path = get_hierarchical_path(text_embeddings_dir, sample_idx) + os.makedirs(text_embedding_dir, exist_ok=True) + text_embedding = encoder_hidden_states[i] # [L, D] + # Ensure saved as CPU numpy array with float16 to save space + if isinstance(text_embedding, torch.Tensor): + text_embedding = text_embedding.cpu().numpy() + # Convert to 
float16 to save space (half the size of float32) + text_embedding = text_embedding.astype(np.float16) + np.save(text_embedding_path, text_embedding) + + # Add to metadata + if row is not None: + sample_meta = { + "index": sample_idx, + "video_path": row.get("video", ""), + "caption": row.get("caption", ""), + } + if args.extract_video and video_code is not None: + sample_meta["video_code_shape"] = list(video_code.shape) + if args.extract_text and text_embedding is not None: + sample_meta["text_embedding_shape"] = list(text_embedding.shape) + process_metadata["samples"].append(sample_meta) + + process_samples_processed += 1 + + # Save per-process metadata periodically (every 1000 samples per process) + if process_samples_processed % 1000 == 0: + process_metadata_file = os.path.join(args.output_dir, f"metadata_process_{process_index}.json") + process_metadata["num_extracted"] = process_samples_processed + process_metadata["failed_samples"] = process_failed_samples + with open(process_metadata_file, 'w') as f: + json.dump(process_metadata, f, indent=2) + logger.info(f"[GPU {process_index}] Progress: {process_samples_processed} samples extracted") + + # Wait for all processes to finish + accelerator.wait_for_everyone() + + # Save per-process metadata + process_metadata_file = os.path.join(args.output_dir, f"metadata_process_{process_index}.json") + process_metadata["num_extracted"] = process_samples_processed + process_metadata["failed_samples"] = process_failed_samples + with open(process_metadata_file, 'w') as f: + json.dump(process_metadata, f, indent=2) + + logger.info(f"[GPU {process_index}] Process complete: {process_samples_processed} samples extracted") + + # Merge metadata from all processes (only on main process) + accelerator.wait_for_everyone() + + if accelerator.is_main_process: + logger.info("Merging metadata from all processes...") + + # Load all process metadata files + all_samples = [] + total_extracted = 0 + all_failed = [] + + for proc_idx in range(num_processes): + proc_metadata_file = os.path.join(args.output_dir, f"metadata_process_{proc_idx}.json") + if os.path.exists(proc_metadata_file): + with open(proc_metadata_file, 'r') as f: + proc_meta = json.load(f) + all_samples.extend(proc_meta.get("samples", [])) + total_extracted += proc_meta.get("num_extracted", 0) + all_failed.extend(proc_meta.get("failed_samples", [])) + + # Sort samples by index + all_samples.sort(key=lambda x: x["index"]) + + # Get codebook_size, mask_token_id, and empty_embeds info from first process metadata + codebook_size = None + mask_token_id = None + empty_embeds_shape = None + empty_embeds_path = None + for proc_idx in range(num_processes): + proc_metadata_file = os.path.join(args.output_dir, f"metadata_process_{proc_idx}.json") + if os.path.exists(proc_metadata_file): + with open(proc_metadata_file, 'r') as f: + proc_meta = json.load(f) + if proc_meta.get("codebook_size") is not None: + codebook_size = proc_meta.get("codebook_size") + if proc_meta.get("mask_token_id") is not None: + mask_token_id = proc_meta.get("mask_token_id") + if proc_meta.get("empty_embeds_shape") is not None: + empty_embeds_shape = proc_meta.get("empty_embeds_shape") + if proc_meta.get("empty_embeds_path") is not None: + empty_embeds_path = proc_meta.get("empty_embeds_path") + if codebook_size is not None and mask_token_id is not None: + if not args.extract_text or (empty_embeds_shape is not None and empty_embeds_path is not None): + break + + # Create merged metadata + merged_metadata = { + "num_samples": total_samples, + 
"num_extracted": total_extracted, + "num_processes": num_processes, + "extract_video": args.extract_video, + "extract_text": args.extract_text, + "text_encoder_architecture": args.text_encoder_architecture if args.extract_text else None, + "video_tokenizer_model_id": args.video_tokenizer_model_id if args.extract_video else None, + "codebook_size": codebook_size, + "mask_token_id": mask_token_id, + "num_frames": args.num_frames, + "video_height": args.video_height, + "video_width": args.video_width, + "prompt_prefix": args.prompt_prefix, + "empty_embeds_shape": empty_embeds_shape, + "empty_embeds_path": empty_embeds_path, + "samples": all_samples, + "failed_samples": sorted(set(all_failed)), + } + + # Save merged metadata + with open(metadata_file, 'w') as f: + json.dump(merged_metadata, f, indent=2) + + logger.info(f"Feature extraction complete!") + logger.info(f" Total samples: {total_samples}") + logger.info(f" Extracted: {total_extracted}") + logger.info(f" Failed: {len(merged_metadata['failed_samples'])}") + if args.extract_video: + logger.info(f" Video codes saved to: {video_codes_dir}") + if args.extract_text: + logger.info(f" Text embeddings saved to: {text_embeddings_dir}") + if empty_embeds_path: + logger.info(f" Empty embeds saved to: {os.path.join(args.output_dir, empty_embeds_path)}") + logger.info(f" Metadata saved to: {metadata_file}") + + # Optionally clean up per-process metadata files + # Uncomment if you want to remove them after merging + # for proc_idx in range(num_processes): + # proc_metadata_file = os.path.join(args.output_dir, f"metadata_process_{proc_idx}.json") + # if os.path.exists(proc_metadata_file): + # os.remove(proc_metadata_file) + + +if __name__ == "__main__": + main() + diff --git a/Meissonic/train/infer_mei_video.py b/Meissonic/train/infer_mei_video.py new file mode 100644 index 0000000000000000000000000000000000000000..8c1329d8f708a244cbe05266283a5418822c628d --- /dev/null +++ b/Meissonic/train/infer_mei_video.py @@ -0,0 +1,1320 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import argparse
+import copy
+import logging
+import math
+import os
+from contextlib import nullcontext
+from pathlib import Path
+import sys
+sys.path.append(os.getcwd())
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from torch.utils.data import DataLoader, default_collate
+from torchvision import transforms
+# Video training uses T5/UMT5, not CLIP
+import diffusers.optimization
+# EMA not used for video training
+from src.scheduler_video import Scheduler
+# Video training only - no image scheduler needed
+from diffusers.loaders import LoraLoaderMixin
+from diffusers.utils import is_wandb_available
+from src.pipeline_video import Pipeline as VideoPipeline
+from torchvision.utils import save_image, make_grid
+from datasets import load_dataset
+from train.trainer_utils import save_checkpoint
+from train.dataset_utils import VideoDataset, OpenVid1MDataset
+from train.dataset_utils import tokenize_prompt, encode_prompt
+# Transformer2DModel removed - video training only uses WanDiscreteVideoTransformer
+from src.transformer_video import WanDiscreteVideoTransformer, WanModel
+from src.pipeline_video import CosmosVideoTokenizer
+from transformers import T5Tokenizer, T5EncoderModel
+
+if is_wandb_available():
+    import wandb
+    # Do not hardcode API keys in source; wandb.login() picks up WANDB_API_KEY
+    # from the environment (or the local netrc created by `wandb login`).
+    wandb.login()
+
+logger = get_logger(__name__, log_level="INFO")
+
+import torch._dynamo
+torch._dynamo.config.verbose = True
+
+# Optionally suppress errors to fall back to eager execution
+torch._dynamo.config.suppress_errors = True
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--text_encoder_architecture",
+        type=str,
+        default="open_clip",
+        required=False,
+        help="The architecture of the text encoder. For video training, must be 'umt5-base' or 't5'",
+    )
+    parser.add_argument(
+        "--instance_dataset",
+        type=str,
+        default=None,
+        required=False,
+        help="The dataset to use for training. One of ['MSCOCO600K', 'PickaPicV2']",
+    )
+    parser.add_argument(
+        "--training_from_scratch",
+        type=bool,
+        default=True,
+        required=False
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--instance_data_dataset",
+        type=str,
+        default=None,
+        required=False,
+        help="A Hugging Face dataset containing the training images",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--instance_data_image", type=str, default=None, required=False, help="A single training image"
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading.
0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument("--ema_decay", type=float, default=0.9999) + parser.add_argument("--ema_update_after_step", type=int, default=0) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument( + "--output_dir", + type=str, + default="muse_training", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. " + "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference." + "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components." + "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step" + "instructions." + ), + ) + parser.add_argument( + "--logging_steps", + type=int, + default=50, + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more details" + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." 
+ ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=0.0003, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run validation every X steps. Validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`" + " and logging the images." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="wandb", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' 
+ ), + ) + parser.add_argument("--validation_prompts", type=str, nargs="*") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument("--split_vae_encode", type=int, required=False, default=None) + parser.add_argument("--min_masking_rate", type=float, default=0.0) + parser.add_argument("--cond_dropout_prob", type=float, default=0.0) + parser.add_argument("--max_grad_norm", default=50.0, type=float, help="Max gradient norm.", required=False) + parser.add_argument("--use_lora", action="store_true", help="Fine tune the model using LoRa") + parser.add_argument("--text_encoder_use_lora", action="store_true", help="Fine tune the model using LoRa") + parser.add_argument("--lora_r", default=16, type=int) + parser.add_argument("--lora_alpha", default=32, type=int) + parser.add_argument("--lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+") + parser.add_argument("--text_encoder_lora_r", default=16, type=int) + parser.add_argument("--text_encoder_lora_alpha", default=32, type=int) + parser.add_argument("--text_encoder_lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+") + parser.add_argument("--train_text_encoder", action="store_true") + parser.add_argument("--image_key", type=str, required=False) + parser.add_argument("--prompt_key", type=str, required=False) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument("--prompt_prefix", type=str, required=False, default=None) + + # Video training specific arguments + # Video training only - model_type is always "video" + parser.add_argument( + "--num_frames", + type=int, + default=16, + help="Number of frames in the video (for video training only)", + ) + parser.add_argument( + "--video_height", + type=int, + default=480, + help="Height of the video in pixels (for video training only)", + ) + parser.add_argument( + "--video_width", + type=int, + default=848, + help="Width of the video in pixels (for video training only)", + ) + parser.add_argument( + "--video_tokenizer_model_id", + type=str, + default="Cosmos-1.0-Tokenizer-DV8x16x16", + help="HuggingFace model ID for Cosmos video tokenizer (for video training only)", + ) + parser.add_argument( + "--wan_pretrained_path", + type=str, + default=None, + help="Path or HuggingFace model ID to Wan pretrained weights. 
If provided, will load Wan weights into the backbone.", + ) + + args = parser.parse_args() + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + + num_datasources = sum( + [x is not None for x in [args.instance_data_dir, args.instance_data_image, args.instance_data_dataset]] + ) + + if num_datasources != 1: + raise ValueError( + "provide one and only one of `--instance_data_dir`, `--instance_data_image`, or `--instance_data_dataset`" + ) + + if args.instance_data_dir is not None: + if not os.path.exists(args.instance_data_dir): + raise ValueError(f"Does not exist: `--args.instance_data_dir` {args.instance_data_dir}") + + if args.instance_data_image is not None: + if not os.path.exists(args.instance_data_image): + raise ValueError(f"Does not exist: `--args.instance_data_image` {args.instance_data_image}") + + if args.instance_data_dataset is not None and (args.image_key is None or args.prompt_key is None): + raise ValueError("`--instance_data_dataset` requires setting `--image_key` and `--prompt_key`") + + return args + +# _prepare_latent_image_ids removed - not used for video training + +def safe_unwrap_model(model, accelerator): + """ + Safely unwrap model from accelerate/distributed wrapper, handling torch.compile. + + The unwrapping order for a compiled + distributed model: + 1. DistributedDataParallel (DDP) → wraps OptimizedModule + 2. OptimizedModule (torch.compile) → has _orig_mod attribute + 3. Original model (WanDiscreteVideoTransformer) + + Args: + model: The model (may be wrapped by accelerate and/or torch.compile) + accelerator: Accelerator instance + + Returns: + The unwrapped model + """ + unwrapped = model + + # Step 1: Unwrap DDP/accelerate wrapper + try: + unwrapped = accelerator.unwrap_model(model) + except (KeyError, AttributeError): + pass + + # Step 2: Handle DDP directly if accelerator.unwrap_model didn't work + if hasattr(unwrapped, 'module'): + unwrapped = unwrapped.module + + # Step 3: Unwrap torch.compile (OptimizedModule has _orig_mod) + if hasattr(unwrapped, '_orig_mod'): + unwrapped = unwrapped._orig_mod + + # Step 4: Handle nested _orig_mod (in case of multiple compile calls) + while hasattr(unwrapped, '_orig_mod'): + unwrapped = unwrapped._orig_mod + + return unwrapped + +def main(args): + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + ) + + if accelerator.is_main_process: + os.makedirs(args.output_dir, exist_ok=True) + + # Make one log on every process with the configuration for debugging. 
+ logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + + if accelerator.is_main_process: + accelerator.init_trackers("meissonic", config=vars(copy.deepcopy(args))) + + if args.seed is not None: + set_seed(args.seed) + + # Initialize text encoder and tokenizer for video training (T5/UMT5 only) + if args.text_encoder_architecture == "umt5-base" or args.text_encoder_architecture == "t5": + if args.resume_from_checkpoint: + text_encoder = T5EncoderModel.from_pretrained( + args.resume_from_checkpoint, subfolder="text_encoder", variant=args.variant + ) + tokenizer = T5Tokenizer.from_pretrained( + args.resume_from_checkpoint, subfolder="tokenizer", variant=args.variant + ) + else: + # Default to UMT5-base for video training + model_id = "google/umt5-base" + text_encoder = T5EncoderModel.from_pretrained(model_id) + tokenizer = T5Tokenizer.from_pretrained(model_id) + else: + raise ValueError(f"For video training, text_encoder_architecture must be 'umt5-base' or 't5', got '{args.text_encoder_architecture}'") + + + # Initialize video tokenizer for video training + device = accelerator.device + dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + dtype = torch.bfloat16 + + video_tokenizer = CosmosVideoTokenizer( + model_id=args.video_tokenizer_model_id, + device=device, + dtype=dtype + ) + video_tokenizer.requires_grad_(False) + + if args.train_text_encoder: + if args.text_encoder_use_lora: + lora_config = LoraConfig( + r=args.text_encoder_lora_r, + lora_alpha=args.text_encoder_lora_alpha, + target_modules=args.text_encoder_lora_target_modules, + ) + text_encoder.add_adapter(lora_config) + text_encoder.train() + text_encoder.requires_grad_(True) + else: + text_encoder.eval() + text_encoder.requires_grad_(False) + + # Initialize video transformer model + if args.training_from_scratch: + # Calculate compressed dimensions based on Cosmos tokenizer + # Cosmos compresses: F' = F // 8, H' = H // 16, W' = W // 16 + # However, actual encoding may have slight variations due to padding/rounding + # So we test with a dummy video to get the exact dimensions + dummy_video = torch.zeros(1, 3, args.num_frames, args.video_height, args.video_width, + device=accelerator.device, dtype=torch.float32) + with torch.no_grad(): + dummy_tokens = video_tokenizer.encode(dummy_video) # [1, F', H', W'] + F_prime, H_prime, W_prime = dummy_tokens.shape[1], dummy_tokens.shape[2], dummy_tokens.shape[3] + logger.info(f"Actual compressed dimensions from tokenizer: F'={F_prime}, H'={H_prime}, W'={W_prime}") + logger.info(f"Theoretical dimensions: F'={args.num_frames // video_tokenizer.t_downsample}, " + f"H'={args.video_height // video_tokenizer.h_downsample}, " + f"W'={args.video_width // video_tokenizer.w_downsample}") + + # Get text encoder dimension + text_dim_actual = text_encoder.config.d_model + + # If Wan pretrained path is provided, load config from it first + wan_config = None + if args.wan_pretrained_path is not None: + logger.info(f"Loading Wan config from: {args.wan_pretrained_path}") + try: + # Try to load WanModel config + try: + wan_backbone_temp = WanModel.from_pretrained( + args.wan_pretrained_path, + subfolder=None, + low_cpu_mem_usage=True, + device_map=None + ) + wan_config = wan_backbone_temp.config + del wan_backbone_temp + except: + try: + wan_backbone_temp = 
WanModel.from_pretrained( + args.wan_pretrained_path, + subfolder="backbone", + low_cpu_mem_usage=True, + device_map=None + ) + wan_config = wan_backbone_temp.config + del wan_backbone_temp + except: + # Try loading config.json directly + import json + config_path = os.path.join(args.wan_pretrained_path, "config.json") + if os.path.exists(config_path): + with open(config_path, 'r') as f: + wan_config_dict = json.load(f) + # Create a simple config object + from types import SimpleNamespace + wan_config = SimpleNamespace(**wan_config_dict) + else: + logger.warning(f"Could not find config in {args.wan_pretrained_path}, using default values") + + if wan_config is not None: + logger.info(f"Loaded Wan config: dim={getattr(wan_config, 'dim', 'N/A')}, " + f"ffn_dim={getattr(wan_config, 'ffn_dim', 'N/A')}, " + f"num_layers={getattr(wan_config, 'num_layers', 'N/A')}, " + f"num_heads={getattr(wan_config, 'num_heads', 'N/A')}") + except Exception as e: + logger.warning(f"Failed to load Wan config: {e}, using default values") + + # Use Wan config if available, otherwise use defaults + dim = getattr(wan_config, 'dim', 2048) if wan_config else 2048 + ffn_dim = getattr(wan_config, 'ffn_dim', 8192) if wan_config else 8192 + num_layers = getattr(wan_config, 'num_layers', 32) if wan_config else 32 + num_heads = getattr(wan_config, 'num_heads', 16) if wan_config else 16 + freq_dim = getattr(wan_config, 'freq_dim', 256) if wan_config else 256 + in_dim = getattr(wan_config, 'in_dim', 16) if wan_config else 16 + out_dim = getattr(wan_config, 'out_dim', 16) if wan_config else 16 + + # text_dim: Use Wan's text_dim if available, but warn if it doesn't match text encoder + wan_text_dim = getattr(wan_config, 'text_dim', None) if wan_config else None + if wan_text_dim is not None and wan_text_dim != text_dim_actual: + logger.warning(f"Wan config text_dim ({wan_text_dim}) doesn't match text encoder dimension ({text_dim_actual}). 
" + f"Will use text encoder dimension and skip loading text_embedding weights.") + text_dim_for_model = text_dim_actual + else: + # Use Wan's text_dim if it matches, or use text encoder dimension + text_dim_for_model = wan_text_dim if wan_text_dim is not None else text_dim_actual + + model = WanDiscreteVideoTransformer( + codebook_size=video_tokenizer.codebook_size, + vocab_size=video_tokenizer.codebook_size + 1, + num_frames=F_prime, + height=H_prime, + width=W_prime, + model_type='t2v', + patch_size=(1, 2, 2), + text_len=512, + in_dim=in_dim, + dim=dim, + ffn_dim=ffn_dim, + freq_dim=freq_dim, + text_dim=text_dim_for_model, + out_dim=out_dim, + num_heads=num_heads, + num_layers=num_layers, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6 + ) + + # Load Wan pretrained weights into backbone if provided + if args.wan_pretrained_path is not None: + logger.info(f"Loading Wan pretrained weights from: {args.wan_pretrained_path}") + try: + # Check if it's a local path or HuggingFace model ID + is_local_path = os.path.exists(args.wan_pretrained_path) and os.path.isdir(args.wan_pretrained_path) + + if is_local_path: + # Local path: find the state dict file + state_dict_path = None + possible_paths = [ + os.path.join(args.wan_pretrained_path, "diffusion_pytorch_model.safetensors"), + os.path.join(args.wan_pretrained_path, "diffusion_pytorch_model.bin"), + os.path.join(args.wan_pretrained_path, "pytorch_model.bin"), + os.path.join(args.wan_pretrained_path, "model.safetensors"), + ] + for p in possible_paths: + if os.path.exists(p): + state_dict_path = p + break + + if state_dict_path is None: + raise FileNotFoundError(f"Could not find state dict in {args.wan_pretrained_path}") + + logger.info(f"Loading weights from local path: {state_dict_path}") + + # Load state dict from local file + if state_dict_path.endswith('.safetensors'): + from safetensors import safe_open + wan_state_dict = {} + with safe_open(state_dict_path, framework="pt", device="cpu") as f: + for k in f.keys(): + wan_state_dict[k] = f.get_tensor(k) + else: + wan_state_dict = torch.load(state_dict_path, map_location="cpu") + else: + # HuggingFace model ID: try to load using from_pretrained + logger.info(f"Loading weights from HuggingFace Hub: {args.wan_pretrained_path}") + try: + # Try loading as WanModel first + temp_model = WanModel.from_pretrained( + args.wan_pretrained_path, + subfolder=None, + low_cpu_mem_usage=False, + device_map=None + ) + wan_state_dict = temp_model.state_dict() + del temp_model + except: + # If that fails, try with 'backbone' subfolder + try: + temp_model = WanModel.from_pretrained( + args.wan_pretrained_path, + subfolder="backbone", + low_cpu_mem_usage=False, + device_map=None + ) + wan_state_dict = temp_model.state_dict() + del temp_model + except: + # Last resort: try to download and load state dict directly + from huggingface_hub import hf_hub_download + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + # Try different possible filenames + possible_files = [ + "diffusion_pytorch_model.safetensors", + "diffusion_pytorch_model.bin", + "pytorch_model.bin", + "model.safetensors", + ] + state_dict_path = None + for filename in possible_files: + try: + state_dict_path = hf_hub_download( + repo_id=args.wan_pretrained_path, + filename=filename, + cache_dir=tmpdir + ) + break + except: + continue + + if state_dict_path is None: + raise FileNotFoundError( + f"Could not find state dict file in HuggingFace model {args.wan_pretrained_path}" + ) + + # Load state dict + if 
state_dict_path.endswith('.safetensors'): + from safetensors import safe_open + wan_state_dict = {} + with safe_open(state_dict_path, framework="pt", device="cpu") as f: + for k in f.keys(): + wan_state_dict[k] = f.get_tensor(k) + else: + wan_state_dict = torch.load(state_dict_path, map_location="cpu") + + # Remove text_embedding weights if text_dim doesn't match + # This is necessary when using a different text encoder (e.g., UMT5-base with 768 dim + # vs Wan's original 4096 dim) + if wan_text_dim is not None and wan_text_dim != text_dim_actual: + keys_to_remove = [k for k in wan_state_dict.keys() if 'text_embedding' in k] + for k in keys_to_remove: + del wan_state_dict[k] + logger.info(f"Removed {len(keys_to_remove)} text_embedding keys due to dimension mismatch " + f"(pretrained: {wan_text_dim}, current: {text_dim_actual})") + + # Load into model's backbone + missing_keys, unexpected_keys = model.backbone.load_state_dict(wan_state_dict, strict=False) + + # Log results + if missing_keys: + # Filter out expected missing keys (text_embedding if removed) + actual_missing = [k for k in missing_keys if 'text_embedding' not in k] + if actual_missing: + logger.warning(f"Missing keys when loading Wan weights: {actual_missing[:10]}..." + if len(actual_missing) > 10 else f"Missing keys: {actual_missing}") + else: + logger.info(f"Only text_embedding keys are missing (expected due to text_dim mismatch)") + if unexpected_keys: + logger.warning(f"Unexpected keys when loading Wan weights: {unexpected_keys[:10]}..." + if len(unexpected_keys) > 10 else f"Unexpected keys: {unexpected_keys}") + + logger.info("✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding)") + + except Exception as e: + logger.warning(f"Failed to load Wan pretrained weights: {e}") + import traceback + traceback.print_exc() + logger.warning("Continuing with random initialization") + else: + # Load from pretrained checkpoint + model = WanDiscreteVideoTransformer.from_pretrained( + args.pretrained_model_name_or_path, subfolder="transformer", low_cpu_mem_usage=False, device_map=None + ) + + # Save vocab_size before torch.compile (for use in training loop) + # This avoids issues with accelerate.unwrap_model when using torch.compile + vocab_size = model.vocab_size + + # Convert model to correct dtype before torch.compile + # This ensures all layers (especially text_embedding which is randomly initialized) are on the right dtype + if accelerator.mixed_precision == "fp16": + model = model.to(dtype=torch.float16) + elif accelerator.mixed_precision == "bf16": + model = model.to(dtype=torch.bfloat16) + # else: keep float32 + + model = torch.compile(model) + + if args.use_lora: + lora_config = LoraConfig( + r=args.lora_r, + lora_alpha=args.lora_alpha, + target_modules=args.lora_target_modules, + ) + model.add_adapter(lora_config) + + model.train() + + if args.gradient_checkpointing: + model.enable_gradient_checkpointing() + if args.train_text_encoder: + text_encoder.gradient_checkpointing_enable() + + # EMA is not used for video training + ema = None + + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + transformer_lora_layers_to_save = None + text_encoder_lora_layers_to_save = None + + for model_ in models: + # Unwrap model_ to get the actual model type (handles torch.compile wrapping) + unwrapped_model_ = safe_unwrap_model(model_, accelerator) + + # Use class name comparison for more robust type checking + # This handles cases where the same class might be loaded from 
different modules + model_class_name = unwrapped_model_.__class__.__name__ + + if model_class_name == "WanDiscreteVideoTransformer": + if args.use_lora: + transformer_lora_layers_to_save = get_peft_model_state_dict(model_) + else: + # Unwrap before saving to avoid torch.compile issues + unwrapped_model_.save_pretrained(os.path.join(output_dir, "transformer")) + elif model_class_name in ["T5EncoderModel", "T5Model"]: + if args.text_encoder_use_lora: + text_encoder_lora_layers_to_save = get_peft_model_state_dict(model_) + else: + # Unwrap before saving to avoid torch.compile issues + unwrapped_model_.save_pretrained(os.path.join(output_dir, "text_encoder")) + else: + raise ValueError(f"unexpected save model: {model_.__class__}, unwrapped: {unwrapped_model_.__class__.__name__}") + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + if transformer_lora_layers_to_save is not None or text_encoder_lora_layers_to_save is not None: + LoraLoaderMixin.save_lora_weights( + output_dir, + unet_lora_layers=transformer_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_lora_layers_to_save, + ) + + # EMA not used for video training + + def load_model_hook(models, input_dir): + transformer = None + text_encoder_ = None + + # this part is added for keep consistency when add model.compile() in the model + def adap_compile(ori_dict):#add '_orig_mod.' to each key + new_dict = {} + for k,v in ori_dict.items(): + new_dict['_orig_mod.'+k] = v + return new_dict + + while len(models) > 0: + model_ = models.pop() + + # Unwrap model to get the actual class name + unwrapped_model_ = safe_unwrap_model(model_, accelerator) + model_class_name = unwrapped_model_.__class__.__name__ + + if model_class_name == "WanDiscreteVideoTransformer": + if args.use_lora: + transformer = model_ + else: + load_model = WanDiscreteVideoTransformer.from_pretrained(os.path.join(input_dir, "transformer"), low_cpu_mem_usage=False, device_map=None) + model_.load_state_dict(adap_compile(load_model.state_dict())) + del load_model + elif model_class_name in ["T5EncoderModel", "T5Model"]: + if args.text_encoder_use_lora: + text_encoder_ = model_ + else: + try: + load_model = T5EncoderModel.from_pretrained(os.path.join(input_dir, "text_encoder")) + model_.load_state_dict(load_model.state_dict()) + except: + print('Not found text-encoder model in current folder. Loading default UMT5-base.') + load_model = T5EncoderModel.from_pretrained("google/umt5-base") + model_.load_state_dict(load_model.state_dict()) + del load_model + else: + raise ValueError(f"unexpected load model: {model_.__class__}, unwrapped: {model_class_name}") + + if transformer is not None or text_encoder_ is not None: + lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + LoraLoaderMixin.load_lora_into_text_encoder( + lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_ + ) + LoraLoaderMixin.load_lora_into_transformer( + lora_state_dict, network_alphas=network_alphas, transformer=transformer + ) + + # EMA not used for video training + + accelerator.register_load_state_pre_hook(load_model_hook) + accelerator.register_save_state_pre_hook(save_model_hook) + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + ) + + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "Please install bitsandbytes to use 8-bit Adam. 
You can do so by running `pip install bitsandbytes`" + ) + + optimizer_cls = bnb.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + # no decay on bias and layernorm and embedding + no_decay = ["bias", "layer_norm.weight", "mlm_ln.weight", "embeddings.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.adam_weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + if args.train_text_encoder: + optimizer_grouped_parameters.append( + {"params": text_encoder.parameters(), "weight_decay": args.adam_weight_decay} + ) + + optimizer = optimizer_cls( + optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + logger.info("Creating dataloaders and lr_scheduler") + + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + # Video training datasets + if args.instance_dataset == 'OpenVid1MDataset': + # OpenVid1M dataset from CSV file + csv_path = args.instance_data_dir + if not os.path.exists(csv_path): + raise FileNotFoundError(f"CSV file not found: {csv_path}") + + # Video root directory: assume videos are in the same directory as CSV or in a 'video_reorg' subdirectory + csv_dir = os.path.dirname(csv_path) + # Try to find video directory + if os.path.exists(os.path.join(csv_dir, 'video_reorg')): + video_root_dir = os.path.join(csv_dir, 'video_reorg') + elif os.path.exists(os.path.join(os.path.dirname(csv_dir), 'video_reorg')): + video_root_dir = os.path.join(os.path.dirname(csv_dir), 'video_reorg') + else: + # Fallback: use CSV directory + video_root_dir = csv_dir + logger.warning(f"Video directory not found, using CSV directory: {video_root_dir}") + + dataset = OpenVid1MDataset( + csv_path=csv_path, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + text_encoder_architecture=args.text_encoder_architecture, + prompt_prefix=args.prompt_prefix, + ) + elif args.instance_dataset == 'HuggingFaceDataset' or args.instance_dataset == 'VideoDataset': + dataset = VideoDataset( + hf_dataset=load_dataset(args.instance_data_dir, split="train"), + tokenizer=tokenizer, + video_key=args.image_key if args.image_key else "video", + prompt_key=args.prompt_key if args.prompt_key else "caption", + prompt_prefix=args.prompt_prefix, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + text_encoder_architecture=args.text_encoder_architecture + ) + else: + raise ValueError(f"For video training, instance_dataset must be 'OpenVid1MDataset', 'HuggingFaceDataset' or 'VideoDataset', got '{args.instance_dataset}'") + + train_dataloader = DataLoader( + dataset, + batch_size=args.train_batch_size, + shuffle=True, + num_workers=args.dataloader_num_workers, + collate_fn=default_collate, + pin_memory=True, + ) + train_dataloader.num_batches = len(train_dataloader) + + lr_scheduler = diffusers.optimization.get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_training_steps=args.max_train_steps * accelerator.num_processes, + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + ) + + logger.info("Preparing model, optimizer and dataloaders") + + if args.train_text_encoder: + model, optimizer, lr_scheduler, 
train_dataloader, text_encoder = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader, text_encoder + ) + else: + model, optimizer, lr_scheduler, train_dataloader = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader + ) + + train_dataloader.num_batches = len(train_dataloader) + + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + if not args.train_text_encoder: + text_encoder.to(device=accelerator.device, dtype=weight_dtype) + + # Video tokenizer is already on the correct device + + # EMA not used for video training + + with nullcontext() if args.train_text_encoder else torch.no_grad(): + # T5/UMT5 doesn't have cond_embeds, only encoder_hidden_states + empty_embeds, _ = encode_prompt( + text_encoder, tokenize_prompt(tokenizer, "", args.text_encoder_architecture).to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + empty_clip_embeds = None # Not used for T5 + + # Video training doesn't use instance_data_image + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps) + # Afterwards we recalculate our number of training epochs. + # Note: We are not doing epoch based training here, but just using this for book keeping and being able to + # reuse the same training loop with other datasets/loaders. + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Train! + logger.info("***** Running training *****") + logger.info(f" Num training steps = {args.max_train_steps}") + logger.info(f" Instantaneous batch size per device = { args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + + resume_from_checkpoint = args.resume_from_checkpoint + if resume_from_checkpoint: + if resume_from_checkpoint == "latest": + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + if len(dirs) > 0: + resume_from_checkpoint = os.path.join(args.output_dir, dirs[-1]) + else: + resume_from_checkpoint = None + + if resume_from_checkpoint is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + else: + accelerator.print(f"Resuming from checkpoint {resume_from_checkpoint}") + + if resume_from_checkpoint is None: + global_step = 0 + first_epoch = 0 + else: + accelerator.load_state(resume_from_checkpoint) + global_step = int(os.path.basename(resume_from_checkpoint).split("-")[1]) + first_epoch = global_step // num_update_steps_per_epoch + + # EMA not used for video training + + # As stated above, we are not doing epoch based training here, but just using this for book keeping and being able to + # reuse the same training loop with other datasets/loaders. 
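+    # Masking-schedule note (illustrative only; it restates what the loop below computes):
+    # each sample draws t ~ U(0, 1) and masks a fraction cos(t * pi / 2) of its flattened
+    # video tokens, clipped at --min_masking_rate. Small t masks almost every token, while t
+    # close to 1 masks almost none. As a hypothetical example, with seq_len = 1000 tokens and
+    # t = 0.5, about cos(pi / 4) ~= 0.71 of the tokens (~707) are replaced by mask_id, and
+    # only those positions contribute to the cross-entropy loss (unmasked labels are set to -100).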
+ for epoch in range(first_epoch, num_train_epochs): + for batch in train_dataloader: + torch.cuda.empty_cache() + with torch.no_grad(): + # Video training path + video_values = batch["video"].to(accelerator.device, non_blocking=True) # [B, C, F, H, W] + batch_size = video_values.shape[0] + + # Encode video to discrete tokens using CosmosVideoTokenizer + split_batch_size = args.split_vae_encode if args.split_vae_encode is not None else batch_size + num_splits = math.ceil(batch_size / split_batch_size) + video_tokens = [] + for i in range(num_splits): + start_idx = i * split_batch_size + end_idx = min((i + 1) * split_batch_size, batch_size) + # video_values: [B, C, F, H, W] + tokens = video_tokenizer.encode(video_values[start_idx:end_idx]) # [B, F', H', W'] + video_tokens.append(tokens) + video_tokens = torch.cat(video_tokens, dim=0) # [B, F', H', W'] + + # Flatten video tokens for masking: [B, F', H', W'] -> [B, F'*H'*W'] + B, F_prime, H_prime, W_prime = video_tokens.shape + seq_len = F_prime * H_prime * W_prime + video_tokens_flat = video_tokens.view(B, seq_len) # [B, seq_len] + + timesteps = torch.rand(batch_size, device=video_tokens_flat.device) + mask_prob = torch.cos(timesteps * math.pi * 0.5) + mask_prob = mask_prob.clip(args.min_masking_rate) + + num_token_masked = (seq_len * mask_prob).round().clamp(min=1) + batch_randperm = torch.rand(batch_size, seq_len, device=video_tokens_flat.device).argsort(dim=-1) + mask = batch_randperm < num_token_masked.unsqueeze(-1) + + mask_id = video_tokenizer.mask_token_id # codebook_size + input_ids_flat = torch.where(mask, mask_id, video_tokens_flat) + labels_flat = torch.where(mask, video_tokens_flat, -100) + + # Reshape back to [B, F', H', W'] for model forward + input_ids = input_ids_flat.view(B, F_prime, H_prime, W_prime) + labels = labels_flat.view(B, F_prime, H_prime, W_prime) + + if "prompt_input_ids" in batch: + with nullcontext() if args.train_text_encoder else torch.no_grad(): + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + + if args.cond_dropout_prob > 0.0: + assert encoder_hidden_states is not None + + batch_size = encoder_hidden_states.shape[0] + + mask = ( + torch.zeros((batch_size, 1, 1), device=encoder_hidden_states.device).float().uniform_(0, 1) + < args.cond_dropout_prob + ) + + empty_embeds_ = empty_embeds.expand(batch_size, -1, -1) + encoder_hidden_states = torch.where( + (encoder_hidden_states * mask).bool(), encoder_hidden_states, empty_embeds_ + ) + + # Handle cond_embeds dropout (only for CLIP, not for T5) + # For T5/UMT5, cond_embeds is None, so skip this step + + # Video tokens are already in [B, F', H', W'] format, no need to reshape + + if "prompt_input_ids" in batch: + with nullcontext() if args.train_text_encoder else torch.no_grad(): + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + + # Train Step + with accelerator.accumulate(model): + # Video training: use WanDiscreteVideoTransformer + # vocab_size is already saved before torch.compile + + # Prepare timesteps: [B] -> [B] (scalar timesteps for video) + timesteps_tensor = (mask_prob * 1000).long().to(input_ids.device) + + # Ensure encoder_hidden_states is on correct dtype + if encoder_hidden_states is not None: + if accelerator.mixed_precision == "fp16" and encoder_hidden_states.dtype != torch.float16: + 
encoder_hidden_states = encoder_hidden_states.to(dtype=torch.float16) + elif accelerator.mixed_precision == "bf16" and encoder_hidden_states.dtype != torch.bfloat16: + encoder_hidden_states = encoder_hidden_states.to(dtype=torch.bfloat16) + elif accelerator.mixed_precision == "no" and encoder_hidden_states.dtype != torch.float32: + encoder_hidden_states = encoder_hidden_states.to(dtype=torch.float32) + + # Forward pass: input_ids is [B, F', H', W'], encoder_hidden_states is [B, L, D] + logits = model( + tokens=input_ids, # [B, F', H', W'] + timesteps=timesteps_tensor, # [B] + encoder_hidden_states=encoder_hidden_states, # [B, L, D] + y=None, + ) # Returns [B, vocab_size, F', H', W'] + + # Reshape logits and labels for loss computation + # logits: [B, vocab_size, F', H', W'] -> [B*F'*H'*W', vocab_size] + B, vocab_size, F_prime_logits, H_prime_logits, W_prime_logits = logits.shape + logits = logits.permute(0, 2, 3, 4, 1).reshape(B * F_prime_logits * H_prime_logits * W_prime_logits, vocab_size) + + # labels: [B, F', H', W'] - may have different dimensions due to patch/unpatch operations + # Crop labels to match logits dimensions if needed + B_labels, F_prime_labels, H_prime_labels, W_prime_labels = labels.shape + assert B == B_labels, f"Batch size mismatch: logits {B} vs labels {B_labels}" + + # Crop labels to match logits spatial dimensions + if F_prime_labels != F_prime_logits or H_prime_labels != H_prime_logits or W_prime_labels != W_prime_logits: + # Crop labels to match logits dimensions + labels = labels[:, :F_prime_logits, :H_prime_logits, :W_prime_logits] + + # labels: [B, F', H', W'] -> [B*F'*H'*W'] + labels_flat = labels.reshape(-1) + + loss = F.cross_entropy( + logits, + labels_flat, + ignore_index=-100, + reduction="mean", + ) + + # Gather the losses across all processes for logging (if we use distributed training). 
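+                # `loss` is the per-process scalar mean; repeating it `train_batch_size` times before
+                # gathering and averaging gives a logging-only estimate of the loss over the global
+                # batch. The backward pass below still uses the local, un-gathered `loss`.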
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + avg_masking_rate = accelerator.gather(mask_prob.repeat(args.train_batch_size)).mean() + + accelerator.backward(loss) + + if args.max_grad_norm is not None and accelerator.sync_gradients: + accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + + optimizer.zero_grad(set_to_none=True) + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + # EMA not used for video training + + if (global_step + 1) % args.logging_steps == 0: + logs = { + "step_loss": avg_loss.item(), + "lr": lr_scheduler.get_last_lr()[0], + "avg_masking_rate": avg_masking_rate.item(), + } + accelerator.log(logs, step=global_step + 1) + + logger.info( + f"Step: {global_step + 1} " + f"Loss: {avg_loss.item():0.4f} " + f"LR: {lr_scheduler.get_last_lr()[0]:0.6f}" + ) + + if (global_step + 1) % args.checkpointing_steps == 0: + save_checkpoint(args, accelerator, global_step + 1, logger) + + if (global_step + 1) % args.validation_steps == 0 and accelerator.is_main_process: + # EMA not used for video training + + with torch.no_grad(): + logger.info("Generating videos for validation...") + + model.eval() + + if args.train_text_encoder: + text_encoder.eval() + + # Video pipeline validation + logger.info("Generating videos for validation...") + + # For video, create scheduler with mask_token_id + scheduler = Scheduler( + mask_token_id=video_tokenizer.mask_token_id, + masking_schedule="cosine" + ) + scheduler.set_timesteps(num_inference_steps=48, device=accelerator.device) + + # Get unwrapped transformer and ensure it's on correct dtype + unwrapped_transformer = safe_unwrap_model(model, accelerator) + # Ensure transformer is on the correct dtype (text_embedding was randomly initialized as float32) + unwrapped_transformer = unwrapped_transformer.to(dtype=weight_dtype) + + pipe = VideoPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=unwrapped_transformer, + scheduler=scheduler, + video_tokenizer=video_tokenizer, + text_len=512, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + ) + + # Generate videos + try: + videos = pipe( + prompt=args.validation_prompts, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + guidance_scale=9.0, + num_inference_steps=48, + output_type="pil", + ).videos + + # Log videos to wandb (save first frame of each video) + if is_wandb_available(): + wandb_images = [] + for i, video in enumerate(videos): + if isinstance(video, list) and len(video) > 0: + first_frame = video[0] + elif isinstance(video, torch.Tensor): + first_frame = transforms.ToPILImage()(video[:, 0, :, :].clamp(0, 1)) + else: + first_frame = video + if first_frame is not None: + prompt_caption = args.validation_prompts[i] if i < len(args.validation_prompts) else f"video_{i}" + wandb_images.append(wandb.Image(first_frame, caption=prompt_caption)) + if wandb_images: + wandb.log({"generated_videos_first_frame": wandb_images}, step=global_step + 1) + + # Save video frames as grid + for i, video in enumerate(videos): + if isinstance(video, list): + frames = [transforms.ToTensor()(frame) for frame in video] + if frames: + frames_tensor = torch.stack(frames, dim=0) + grid = make_grid(frames_tensor, nrow=min(4, len(frames))) + save_image(grid, os.path.join(args.output_dir, f"{global_step}_video_{i}_CFG-9.png")) + elif isinstance(video, torch.Tensor): + C, 
num_frames_video, H, W = video.shape + frames_list = [video[:, f, :, :] for f in range(num_frames_video)] + frames_tensor = torch.stack(frames_list, dim=0) + grid = make_grid(frames_tensor, nrow=min(4, num_frames_video)) + save_image(grid, os.path.join(args.output_dir, f"{global_step}_video_{i}_CFG-9.png")) + + logger.info(f"Validation videos saved to {args.output_dir}") + except Exception as e: + logger.error(f"Video validation failed: {e}") + import traceback + traceback.print_exc() + + + + model.train() + + if args.train_text_encoder: + text_encoder.train() + + # EMA not used for video training + + global_step += 1 + + # Stop training if max steps is reached + if global_step >= args.max_train_steps: + break + # End for + + accelerator.wait_for_everyone() + + # Evaluate and save checkpoint at the end of training + save_checkpoint(args, accelerator, global_step, logger) + + # Save the final trained checkpoint + if accelerator.is_main_process: + model = safe_unwrap_model(model, accelerator) + # EMA not used for video training + model.save_pretrained(args.output_dir) + + accelerator.end_training() + + + + + +if __name__ == "__main__": + main(parse_args()) + + diff --git a/Meissonic/train/run_overfit.sh b/Meissonic/train/run_overfit.sh new file mode 100644 index 0000000000000000000000000000000000000000..0091e75da8bf68d04fe04dbf96b18c41e7d84ee1 --- /dev/null +++ b/Meissonic/train/run_overfit.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Overfitting experiment script +# This script runs a small overfitting experiment to verify implementation correctness + +accelerate launch --multi_gpu --gpu_ids '0,1,2,3,4,5,6,7' --main_process_port 25011 --num_processes 8 train/train_overfit.py \ + --text_encoder_architecture umt5-base \ + --video_tokenizer_model_id "Cosmos-1.0-Tokenizer-DV8x16x16" \ + --instance_data_dir "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ + --max_samples 256 \ + --num_frames 8 \ + --video_height 64 \ + --video_width 112 \ + --dataloader_num_workers 8 \ + --train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --learning_rate 5e-4 \ + --max_train_steps 3000 \ + --lr_warmup_steps 100 \ + --gradient_checkpointing \ + --mixed_precision bf16 \ + --seed 42 \ + --output_dir "./output_overfit" \ + --logging_steps 50 \ + --save_steps 500 \ + --inference_steps 500 \ + --num_inference_samples 4 \ + --num_inference_steps 48 \ + --wan_pretrained_path Wan-AI/Wan2.1-T2V-1.3B # Optional: path to pretrained Wan weights + diff --git a/Meissonic/train/test_cosmos_vqvae.py b/Meissonic/train/test_cosmos_vqvae.py new file mode 100644 index 0000000000000000000000000000000000000000..30a15da3870c15fa471dc39078c7e5f55cb4ab53 --- /dev/null +++ b/Meissonic/train/test_cosmos_vqvae.py @@ -0,0 +1,500 @@ +#!/usr/bin/env python3 +""" +Test script for Cosmos VQ-VAE performance. + +This script: +1. Loads a video from the training dataset +2. Encodes it using CosmosVideoTokenizer +3. Decodes it back +4. Computes metrics (PSNR, SSIM, MSE) +5. Creates a side-by-side comparison video +6. 
Saves the results +""" + +import argparse +import os +import sys +sys.path.append(os.getcwd()) + +import torch +import numpy as np +from PIL import Image +import cv2 +from torchvision import transforms +from torchvision.utils import make_grid, save_image + +from src.pipeline_video import CosmosVideoTokenizer +from train.dataset_utils import OpenVid1MDataset, TinyOpenVid1MDataset +from transformers import T5Tokenizer + + +def calculate_psnr(img1, img2, max_val=1.0): + """Calculate PSNR between two images.""" + mse = torch.mean((img1 - img2) ** 2) + if mse == 0: + return float('inf') + psnr = 20 * torch.log10(max_val / torch.sqrt(mse)) + return psnr.item() + + +def calculate_mse(img1, img2): + """Calculate MSE between two images.""" + return torch.mean((img1 - img2) ** 2).item() + + +def calculate_ssim(img1, img2, window_size=11): + """Calculate SSIM between two images (simplified version).""" + # Simple SSIM approximation + C1 = 0.01 ** 2 + C2 = 0.03 ** 2 + + mu1 = img1.mean() + mu2 = img2.mean() + + sigma1_sq = img1.var() + sigma2_sq = img2.var() + sigma12 = ((img1 - mu1) * (img2 - mu2)).mean() + + ssim = ((2 * mu1 * mu2 + C1) * (2 * sigma12 + C2)) / ((mu1**2 + mu2**2 + C1) * (sigma1_sq + sigma2_sq + C2)) + return ssim.item() + + +def video_to_numpy(video_tensor): + """ + Convert video tensor [C, F, H, W] in [0, 1] to numpy array [F, H, W, C] in [0, 255] (RGB). + """ + if isinstance(video_tensor, torch.Tensor): + # [C, F, H, W] -> [F, C, H, W] -> [F, H, W, C] + # First move frame dimension to front, then transpose channels to last + video_np = video_tensor.permute(1, 0, 2, 3).cpu().numpy() # [F, C, H, W] + video_np = np.transpose(video_np, (0, 2, 3, 1)) # [F, H, W, C] + # Clamp to [0, 1] and convert to [0, 255] + video_np = np.clip(video_np, 0, 1) + video_np = (video_np * 255).astype(np.uint8) + else: + video_np = np.array(video_tensor) + return video_np + + +def create_side_by_side_video(original, reconstructed, output_path, fps=8): + """ + Create a side-by-side comparison video. 
+ + Args: + original: Original video tensor [C, F, H, W] or numpy array + reconstructed: Reconstructed video tensor [C, F, H, W] or numpy array + output_path: Path to save the output video + fps: Frames per second + """ + # Convert to numpy (RGB format: [F, H, W, C]) + orig_np = video_to_numpy(original) + recon_np = video_to_numpy(reconstructed) + + # Get dimensions + F, H, W, C = orig_np.shape + F_recon, H_recon, W_recon, C_recon = recon_np.shape + + # Ensure same number of frames + F_min = min(F, F_recon) + orig_np = orig_np[:F_min] + recon_np = recon_np[:F_min] + + # Resize if dimensions don't match + if H != H_recon or W != W_recon: + print(f"Resizing reconstructed video from ({H_recon}, {W_recon}) to ({H}, {W})") + recon_np_resized = np.zeros((F_min, H, W, C), dtype=np.uint8) + for f in range(F_min): + # cv2.resize expects (width, height) for size parameter + recon_np_resized[f] = cv2.resize(recon_np[f], (W, H), interpolation=cv2.INTER_LINEAR) + recon_np = recon_np_resized + + # Add text labels to frames + from PIL import Image, ImageDraw, ImageFont + side_by_side_frames = [] + for f in range(F_min): + # Original frame with label + orig_frame_pil = Image.fromarray(orig_np[f]) + draw = ImageDraw.Draw(orig_frame_pil) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 32) + except: + try: + font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 32) + except: + font = ImageFont.load_default() + # Draw text with outline for visibility + text = "Original" + x, y = 20, 20 + for adj in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]: + draw.text((x + adj[0], y + adj[1]), text, font=font, fill=(0, 0, 0)) + draw.text((x, y), text, font=font, fill=(255, 255, 255)) + orig_frame = np.array(orig_frame_pil) + + # Reconstructed frame with label + recon_frame_pil = Image.fromarray(recon_np[f]) + draw = ImageDraw.Draw(recon_frame_pil) + text = "Reconstructed" + x, y = 20, 20 + for adj in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]: + draw.text((x + adj[0], y + adj[1]), text, font=font, fill=(0, 0, 0)) + draw.text((x, y), text, font=font, fill=(255, 255, 0)) # Yellow text + recon_frame = np.array(recon_frame_pil) + + # Concatenate horizontally + frame = np.concatenate([orig_frame, recon_frame], axis=1) + side_by_side_frames.append(frame) + + # Write video using OpenCV (needs BGR format) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(output_path, fourcc, fps, (W * 2, H)) + + if not out.isOpened(): + print(f"Warning: Could not open video writer with mp4v codec, trying XVID...") + fourcc = cv2.VideoWriter_fourcc(*'XVID') + out = cv2.VideoWriter(output_path, fourcc, fps, (W * 2, H)) + + for frame in side_by_side_frames: + # Convert RGB to BGR for OpenCV + frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + out.write(frame_bgr) + + out.release() + print(f"Saved side-by-side video to: {output_path}") + + +def add_text_to_image(image_tensor, text, position=(10, 30)): + """ + Add text label to an image tensor. 
+ + Args: + image_tensor: Image tensor [C, H, W] in [0, 1] + text: Text to add + position: (x, y) position for text + Returns: + Image tensor with text [C, H, W] + """ + # Convert to PIL Image + image_np = image_tensor.permute(1, 2, 0).cpu().numpy() # [H, W, C] + image_np = np.clip(image_np, 0, 1) + image_np = (image_np * 255).astype(np.uint8) + pil_image = Image.fromarray(image_np) + + # Add text + from PIL import ImageDraw, ImageFont + draw = ImageDraw.Draw(pil_image) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 24) + except: + try: + font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) + except: + font = ImageFont.load_default() + + # Draw white text with black outline + x, y = position + # Draw outline + for adj in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]: + draw.text((x + adj[0], y + adj[1]), text, font=font, fill=(0, 0, 0)) + # Draw main text + draw.text((x, y), text, font=font, fill=(255, 255, 255)) + + # Convert back to tensor + image_tensor = transforms.ToTensor()(pil_image) + return image_tensor + + +def create_comparison_grid(original, reconstructed, output_path, nrow=4): + """ + Create a grid image comparing original and reconstructed frames. + + Args: + original: Original video tensor [C, F, H, W] + reconstructed: Reconstructed video tensor [C, F, H, W] + output_path: Path to save the grid image + nrow: Number of frames per row + """ + # Get number of frames + F = min(original.shape[1], reconstructed.shape[1]) + + # Select frames to display + num_frames_to_show = min(8, F) + frame_indices = np.linspace(0, F - 1, num_frames_to_show, dtype=int) + + frames_list = [] + for idx in frame_indices: + # Original frame with label + orig_frame = original[:, idx, :, :].clone() # [C, H, W] + orig_frame = add_text_to_image(orig_frame, "Original", position=(10, 10)) + frames_list.append(orig_frame) + + # Reconstructed frame with label + recon_frame = reconstructed[:, idx, :, :].clone() # [C, H, W] + recon_frame = add_text_to_image(recon_frame, "Reconstructed", position=(10, 10)) + frames_list.append(recon_frame) + + # Create grid + frames_tensor = torch.stack(frames_list, dim=0) + grid = make_grid(frames_tensor, nrow=nrow * 2, padding=2, pad_value=1.0) + + save_image(grid, output_path) + print(f"Saved comparison grid to: {output_path}") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Test Cosmos VQ-VAE performance") + + parser.add_argument( + "--csv_path", + type=str, + required=True, + help="Path to OpenVid1M CSV file" + ) + parser.add_argument( + "--video_root_dir", + type=str, + default=None, + help="Root directory for videos (auto-detected if not provided)" + ) + parser.add_argument( + "--video_index", + type=int, + default=0, + help="Index of video to test (default: 0)" + ) + parser.add_argument( + "--video_tokenizer_model_id", + type=str, + default="Cosmos-1.0-Tokenizer-DV8x16x16", + help="Cosmos tokenizer model ID" + ) + parser.add_argument( + "--num_frames", + type=int, + default=16, + help="Number of frames" + ) + parser.add_argument( + "--height", + type=int, + default=480, + help="Video height" + ) + parser.add_argument( + "--width", + type=int, + default=848, + help="Video width" + ) + parser.add_argument( + "--output_dir", + type=str, + default="./cosmos_test_output", + help="Output directory for results" + ) + parser.add_argument( + "--device", + type=str, + default="cuda" if torch.cuda.is_available() else "cpu", + help="Device to use" + ) + parser.add_argument( + 
"--dtype", + type=str, + default="float32", + choices=["float32", "float16", "bfloat16"], + help="Data type" + ) + + return parser.parse_args() + + +def main(): + args = parse_args() + + # Create output directory + os.makedirs(args.output_dir, exist_ok=True) + + # Set device and dtype + device = torch.device(args.device) + if args.dtype == "float16": + dtype = torch.float16 + elif args.dtype == "bfloat16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + + print(f"Using device: {device}, dtype: {dtype}") + + # Initialize tokenizer + print("Initializing CosmosVideoTokenizer...") + video_tokenizer = CosmosVideoTokenizer( + model_id=args.video_tokenizer_model_id, + device=device, + dtype=dtype + ) + print(f"Codebook size: {video_tokenizer.codebook_size}") + print(f"Downsampling factors: t={video_tokenizer.t_downsample}, " + f"h={video_tokenizer.h_downsample}, w={video_tokenizer.w_downsample}") + + # Load dataset + print(f"Loading dataset from: {args.csv_path}") + + # Auto-detect video_root_dir if not provided + video_root_dir = args.video_root_dir + if video_root_dir is None: + csv_dir = os.path.dirname(args.csv_path) + if os.path.exists(os.path.join(csv_dir, 'video_reorg')): + video_root_dir = os.path.join(csv_dir, 'video_reorg') + elif os.path.exists(os.path.join(os.path.dirname(csv_dir), 'video_reorg')): + video_root_dir = os.path.join(os.path.dirname(csv_dir), 'video_reorg') + else: + video_root_dir = csv_dir + print(f"Warning: Video directory not found, using CSV directory: {video_root_dir}") + + # Initialize tokenizer for dataset (needed for OpenVid1MDataset) + tokenizer = T5Tokenizer.from_pretrained("google/umt5-base") + + # Create dataset + dataset = OpenVid1MDataset( + csv_path=args.csv_path, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=args.num_frames, + height=args.height, + width=args.width, + text_encoder_architecture="umt5-base", + ) + + print(f"Dataset size: {len(dataset)}") + + # Load video + if args.video_index >= len(dataset): + print(f"Error: video_index {args.video_index} >= dataset size {len(dataset)}") + return + + print(f"Loading video at index {args.video_index}...") + sample = dataset[args.video_index] + original_video = sample["video"] # [C, F, H, W] + + # Get video info from dataset + row = dataset.data[args.video_index] + video_path = row.get('video', 'unknown') + caption = row.get('caption', 'no caption') + + print(f"Video path: {video_path}") + print(f"Caption: {caption}") + print(f"Original video shape: {original_video.shape}") + print(f"Original video range: [{original_video.min():.3f}, {original_video.max():.3f}]") + + # Move to device + original_video = original_video.to(device=device, dtype=dtype) + + # Encode + print("\nEncoding video...") + with torch.no_grad(): + codes = video_tokenizer.encode(original_video.unsqueeze(0)) # [1, F', H', W'] + + print(f"Encoded codes shape: {codes.shape}") + print(f"Codes range: [{codes.min().item()}, {codes.max().item()}]") + print(f"Codebook size: {video_tokenizer.codebook_size}") + + # Decode + print("\nDecoding video...") + with torch.no_grad(): + reconstructed_video = video_tokenizer.decode(codes) # [1, C, F, H, W] + reconstructed_video = reconstructed_video.squeeze(0) # [C, F, H, W] + + print(f"Reconstructed video shape: {reconstructed_video.shape}") + print(f"Reconstructed video range: [{reconstructed_video.min():.3f}, {reconstructed_video.max():.3f}]") + + # Ensure same number of frames for comparison + F_orig = original_video.shape[1] + F_recon = reconstructed_video.shape[1] + 
F_min = min(F_orig, F_recon) + + original_video = original_video[:, :F_min, :, :] + reconstructed_video = reconstructed_video[:, :F_min, :, :] + + # Resize if spatial dimensions don't match + if original_video.shape[2:] != reconstructed_video.shape[2:]: + print(f"Resizing reconstructed video from {reconstructed_video.shape[2:]} to {original_video.shape[2:]}") + # Use interpolation to resize + reconstructed_video_resized = torch.zeros_like(original_video) + for f in range(F_min): + frame = reconstructed_video[:, f, :, :].unsqueeze(0) # [1, C, H, W] + frame_resized = torch.nn.functional.interpolate( + frame, size=original_video.shape[2:], mode='bilinear', align_corners=False + ) + reconstructed_video_resized[:, f, :, :] = frame_resized.squeeze(0) + reconstructed_video = reconstructed_video_resized + + # Calculate metrics + print("\nCalculating metrics...") + + # Convert to float32 for metric calculation + orig_f32 = original_video.to(torch.float32) + recon_f32 = reconstructed_video.to(torch.float32) + + # Frame-wise metrics + psnr_values = [] + mse_values = [] + ssim_values = [] + + for f in range(F_min): + orig_frame = orig_f32[:, f, :, :] # [C, H, W] + recon_frame = recon_f32[:, f, :, :] # [C, H, W] + + psnr = calculate_psnr(orig_frame, recon_frame) + mse = calculate_mse(orig_frame, recon_frame) + ssim = calculate_ssim(orig_frame, recon_frame) + + psnr_values.append(psnr) + mse_values.append(mse) + ssim_values.append(ssim) + + # Overall metrics + avg_psnr = np.mean(psnr_values) + avg_mse = np.mean(mse_values) + avg_ssim = np.mean(ssim_values) + + print(f"\n=== Metrics ===") + print(f"PSNR: {avg_psnr:.2f} dB (per frame: {psnr_values})") + print(f"MSE: {avg_mse:.6f} (per frame: {mse_values})") + print(f"SSIM: {avg_ssim:.4f} (per frame: {ssim_values})") + + # Save metrics to file + metrics_file = os.path.join(args.output_dir, f"metrics_video_{args.video_index}.txt") + with open(metrics_file, 'w') as f: + f.write(f"Video Index: {args.video_index}\n") + f.write(f"Video Path: {video_path}\n") + f.write(f"Caption: {caption}\n") + f.write(f"\n=== Metrics ===\n") + f.write(f"Average PSNR: {avg_psnr:.2f} dB\n") + f.write(f"Average MSE: {avg_mse:.6f}\n") + f.write(f"Average SSIM: {avg_ssim:.4f}\n") + f.write(f"\nPer-frame PSNR: {psnr_values}\n") + f.write(f"Per-frame MSE: {mse_values}\n") + f.write(f"Per-frame SSIM: {ssim_values}\n") + + print(f"Saved metrics to: {metrics_file}") + + # Create side-by-side video + print("\nCreating side-by-side comparison video...") + video_output_path = os.path.join(args.output_dir, f"comparison_video_{args.video_index}.mp4") + create_side_by_side_video(original_video, reconstructed_video, video_output_path, fps=8) + + # Create comparison grid + print("Creating comparison grid...") + grid_output_path = os.path.join(args.output_dir, f"comparison_grid_video_{args.video_index}.png") + create_comparison_grid(original_video, reconstructed_video, grid_output_path, nrow=4) + + print(f"\n=== Test Complete ===") + print(f"Results saved to: {args.output_dir}") + print(f" - Metrics: {metrics_file}") + print(f" - Side-by-side video: {video_output_path}") + print(f" - Comparison grid: {grid_output_path}") + + +if __name__ == "__main__": + main() + diff --git a/Meissonic/train/test_cosmos_vqvae.sh b/Meissonic/train/test_cosmos_vqvae.sh new file mode 100644 index 0000000000000000000000000000000000000000..c6bdaea7cda7b6bdaa1e4a56de74443202793331 --- /dev/null +++ b/Meissonic/train/test_cosmos_vqvae.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Test script for Cosmos VQ-VAE performance + + 
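+# Notes (environment-specific assumptions, adjust as needed):
+#   --csv_path below points to one particular OpenVid1M layout; change it to your own copy.
+#   The tokenizer ID encodes its compression: Cosmos-0.1-Tokenizer-DV4x8x8 presumably downsamples
+#   time by 4 and height/width by 8, while the DV8x16x16 model used as the training default
+#   downsamples by 8/16/16, so choose --num_frames/--height/--width with those factors in mind.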
+python train/test_cosmos_vqvae.py \ + --csv_path "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ + --video_index 0 \ + --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \ + --num_frames 17 \ + --height 128 \ + --width 128 \ + --output_dir "./cosmos_test_output" \ + --device cuda \ + --dtype float32 + +# python train/test_cosmos_vqvae.py \ +# --csv_path "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ +# --video_index 0 \ +# --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \ +# --num_frames 16 \ +# --height 480 \ +# --width 848 \ +# --output_dir "./cosmos_test_output" \ +# --device cuda \ +# --dtype float32 + + +# python train/test_cosmos_vqvae.py \ +# --csv_path "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ +# --video_index 1 \ +# --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \ +# --num_frames 16 \ +# --height 480 \ +# --width 848 \ +# --output_dir "./cosmos_test_output" \ +# --device cuda \ +# --dtype float32 + +# python train/test_cosmos_vqvae.py \ +# --csv_path "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ +# --video_index 2 \ +# --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \ +# --num_frames 16 \ +# --height 480 \ +# --width 848 \ +# --output_dir "./cosmos_test_output" \ +# --device cuda \ +# --dtype float32 + + +# python train/test_cosmos_vqvae.py \ +# --csv_path "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ +# --video_index 3 \ +# --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \ +# --num_frames 16 \ +# --height 480 \ +# --width 848 \ +# --output_dir "./cosmos_test_output" \ +# --device cuda \ +# --dtype float32 + + diff --git a/Meissonic/train/train.sh b/Meissonic/train/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..6667ecd895a97862e05dba000d775a8849e54f13 --- /dev/null +++ b/Meissonic/train/train.sh @@ -0,0 +1,33 @@ +# bash it in root path +PYTHON_PATH='./' accelerate launch --multi_gpu --gpu_ids '0,1,2,3' --main_process_port 25011 --num_processes 4 train/train_meissonic.py \ + --output_dir "../CKPT_OUTPUT_PATH" \ + --train_batch_size 4 \ + --gradient_accumulation_steps 2 \ + --learning_rate 1e-4 \ + --max_grad_norm 10 \ + --pretrained_model_name_or_path "meissonflow/meissonic" \ + --text_encoder_architecture 'open_clip' \ + --pretrained_model_architecture 'Meissonic' \ + --training_from_scratch True \ + --instance_dataset 'DATA_TYPE' \ + --instance_data_dir '../parquets_father_dir/' \ + --resolution 1024 \ + --mixed_precision bf16 \ + --lr_scheduler constant \ + --use_8bit_adam \ + --dataloader_num_workers 64 \ + --validation_prompts \ + 'a boy' \ + 'A serene mountain landscape with towering snow-capped peaks, a crystal-clear blue lake reflecting the mountains, dense pine forests, and a vibrant orange sunrise illuminating the sky.' \ + 'A playful golden retriever puppy with a shiny coat, bounding through a meadow filled with colorful wildflowers, under a bright, clear blue sky.' \ + 'A bustling city street at night, illuminated by vibrant neon signs in various colors, with busy pedestrians, street vendors, and a light rain creating reflective puddles on the pavement.' \ + 'A majestic, medieval castle perched on a rugged cliffside, overlooking a vast, calm ocean at sunset, with the sky painted in hues of pink, orange, and purple.' 
\ + 'An elegant ballerina in a white tutu, dancing gracefully on a grand stage with ornate, gold-trimmed curtains, under a spotlight that casts a soft glow.' \ + 'A cozy, rustic log cabin nestled in a snow-covered forest, with smoke rising from the stone chimney, warm lights glowing from the windows, and a path of footprints leading to the front door.'\ + 'A Cute Cat' \ + 'A Snow Mountain'\ + --max_train_steps 100000 \ + --checkpointing_steps 1000 \ + --validation_steps 200 \ + --report_to 'wandb' \ + --logging_steps 10 diff --git a/Meissonic/train/train_mei_video.py b/Meissonic/train/train_mei_video.py new file mode 100644 index 0000000000000000000000000000000000000000..59f53b3d2517fe3b460895ab78d7611205f2c759 --- /dev/null +++ b/Meissonic/train/train_mei_video.py @@ -0,0 +1,1788 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import copy +import logging +import math +import os +from contextlib import nullcontext +from pathlib import Path +import sys +sys.path.append(os.getcwd()) +import torch +import torch.nn.functional as F +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from peft import LoraConfig +from peft.utils import get_peft_model_state_dict +from torch.utils.data import DataLoader, default_collate +from torchvision import transforms +# Video training uses T5/UMT5, not CLIP +import diffusers.optimization +# EMA not used for video training +from src.scheduler_video import Scheduler +# Video training only - no image scheduler needed +from diffusers.loaders import LoraLoaderMixin +from diffusers.utils import is_wandb_available +from src.pipeline_video import Pipeline as VideoPipeline +from torchvision.utils import save_image,make_grid +from datasets import load_dataset +from train.trainer_utils import save_checkpoint +from train.dataset_utils import VideoDataset, OpenVid1MDataset, PrecomputedFeatureDataset +from train.dataset_utils import tokenize_prompt, encode_prompt +# Transformer2DModel removed - video training only uses WanDiscreteVideoTransformer +from src.transformer_video import WanDiscreteVideoTransformer, WanModel +from src.pipeline_video import CosmosVideoTokenizer +from transformers import T5Tokenizer, T5EncoderModel + +if is_wandb_available(): + import wandb + wandb.login(key="a96e0066098e5f64211a77b604ba2b1dd7bd7834") + +logger = get_logger(__name__, log_level="INFO") + +import torch._dynamo +torch._dynamo.config.verbose = True + +# Optionally suppress errors to fall back to eager execution +torch._dynamo.config.suppress_errors = True + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--text_encoder_architecture", + type=str, + default="open_clip", + required=False, + help="The architecture of the text encoder. 
For video training, must be 'umt5-base', 'umt5-xxl', or 't5'", + ) + parser.add_argument( + "--instance_dataset", + type=str, + default=None, + required=False, + help="The dataset to use for training. One of ['MSCOCO600K', 'PickaPicV2']", + ) + parser.add_argument( + "--training_from_scratch", + type=bool, + default=True, + required=False + ) + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--instance_data_dataset", + type=str, + default=None, + required=False, + help="A Hugging Face dataset containing the training images", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--instance_data_image", type=str, default=None, required=False, help="A single training image" + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=4, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process. " + "Recommended: 4-8 for video loading. Set to 0 if you encounter issues with multiprocessing." + ), + ) + parser.add_argument( + "--dataloader_prefetch_factor", + type=int, + default=2, + help=( + "Number of batches loaded in advance by each worker. Higher values can improve GPU utilization " + "but use more memory. Default: 2." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument("--ema_decay", type=float, default=0.9999) + parser.add_argument("--ema_update_after_step", type=int, default=0) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument( + "--output_dir", + type=str, + default="muse_training", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." 
+ ), + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. " + "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference." + "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components." + "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step" + "instructions." + ), + ) + parser.add_argument( + "--logging_steps", + type=int, + default=50, + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more details" + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=0.0003, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run validation every X steps. Validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`" + " and logging the images." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="wandb", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. 
Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument("--validation_prompts", type=str, nargs="*") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument("--split_vae_encode", type=int, required=False, default=None) + parser.add_argument("--min_masking_rate", type=float, default=0.0) + parser.add_argument("--cond_dropout_prob", type=float, default=0.0) + parser.add_argument("--max_grad_norm", default=50.0, type=float, help="Max gradient norm.", required=False) + parser.add_argument("--use_lora", action="store_true", help="Fine tune the model using LoRa") + parser.add_argument("--text_encoder_use_lora", action="store_true", help="Fine tune the model using LoRa") + parser.add_argument("--lora_r", default=16, type=int) + parser.add_argument("--lora_alpha", default=32, type=int) + parser.add_argument("--lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+") + parser.add_argument("--text_encoder_lora_r", default=16, type=int) + parser.add_argument("--text_encoder_lora_alpha", default=32, type=int) + parser.add_argument("--text_encoder_lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+") + parser.add_argument("--train_text_encoder", action="store_true") + parser.add_argument("--image_key", type=str, required=False) + parser.add_argument("--prompt_key", type=str, required=False) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument("--prompt_prefix", type=str, required=False, default=None) + + # Video training specific arguments + # Video training only - model_type is always "video" + parser.add_argument( + "--num_frames", + type=int, + default=16, + help="Number of frames in the video (for video training only)", + ) + parser.add_argument( + "--video_height", + type=int, + default=480, + help="Height of the video in pixels (for video training only)", + ) + parser.add_argument( + "--video_width", + type=int, + default=848, + help="Width of the video in pixels (for video training only)", + ) + parser.add_argument( + "--video_tokenizer_model_id", + type=str, + default="Cosmos-1.0-Tokenizer-DV8x16x16", + help="HuggingFace model ID for Cosmos video tokenizer (for video training only)", + ) + parser.add_argument( + "--wan_pretrained_path", + type=str, + default=None, + help="Path or HuggingFace model ID to Wan pretrained weights. If provided, will load Wan weights into the backbone.", + ) + parser.add_argument( + "--freeze_wan_backbone", + action="store_true", + help="Freeze Wan backbone weights (set lr=0). If set, --wan_backbone_lr_ratio is ignored.", + ) + parser.add_argument( + "--wan_backbone_lr_ratio", + type=float, + default=0.1, + help="Learning rate ratio for Wan backbone relative to other parts (token_embedding, logits_head). Default: 0.1 (backbone lr = base_lr * 0.1). 
Ignored if --freeze_wan_backbone is set.", + ) + parser.add_argument( + "--use_precomputed_features", + action="store_true", + help="Use pre-extracted features (video codes and text embeddings) instead of encoding on-the-fly.", + ) + parser.add_argument( + "--features_dir", + type=str, + default=None, + help="Directory containing pre-extracted features (required if --use_precomputed_features is set).", + ) + parser.add_argument( + "--empty_embeds_path", + type=str, + default=None, + help="Path to pre-extracted empty_embeds .pt file (required if --use_precomputed_features and --cond_dropout_prob > 0).", + ) + + args = parser.parse_args() + + # Validate precomputed features arguments + if args.use_precomputed_features: + if args.features_dir is None: + raise ValueError("--features_dir is required when --use_precomputed_features is set") + if not os.path.exists(args.features_dir): + raise ValueError(f"Features directory not found: {args.features_dir}") + # Check if empty_embeds is needed + if args.cond_dropout_prob > 0.0: + # Try to get empty_embeds_path from metadata.json if not provided + metadata_file = os.path.join(args.features_dir, "metadata.json") + if args.empty_embeds_path is None and os.path.exists(metadata_file): + import json + with open(metadata_file, 'r') as f: + metadata = json.load(f) + if metadata.get("empty_embeds_path"): + args.empty_embeds_path = os.path.join(args.features_dir, metadata["empty_embeds_path"]) + # Use print instead of logger since Accelerator is not initialized yet + print(f"Found empty_embeds_path in metadata: {args.empty_embeds_path}") + + if args.empty_embeds_path is None: + raise ValueError( + "--empty_embeds_path is required when --use_precomputed_features is set " + "and --cond_dropout_prob > 0.0. " + "Please run extract_features.py with --extract_text to generate the empty_embeds file." + ) + if not os.path.exists(args.empty_embeds_path): + raise ValueError(f"Empty embeds file not found: {args.empty_embeds_path}") + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + + num_datasources = sum( + [x is not None for x in [args.instance_data_dir, args.instance_data_image, args.instance_data_dataset]] + ) + + # if num_datasources != 1: + # raise ValueError( + # "provide one and only one of `--instance_data_dir`, `--instance_data_image`, or `--instance_data_dataset`" + # ) + + if args.instance_data_dir is not None: + if not os.path.exists(args.instance_data_dir): + raise ValueError(f"Does not exist: `--args.instance_data_dir` {args.instance_data_dir}") + + if args.instance_data_image is not None: + if not os.path.exists(args.instance_data_image): + raise ValueError(f"Does not exist: `--args.instance_data_image` {args.instance_data_image}") + + if args.instance_data_dataset is not None and (args.image_key is None or args.prompt_key is None): + raise ValueError("`--instance_data_dataset` requires setting `--image_key` and `--prompt_key`") + + return args + +# _prepare_latent_image_ids removed - not used for video training + +def safe_unwrap_model(model, accelerator): + """ + Safely unwrap model from accelerate/distributed wrapper, handling torch.compile. + + The unwrapping order for a compiled + distributed model: + 1. DistributedDataParallel (DDP) → wraps OptimizedModule + 2. OptimizedModule (torch.compile) → has _orig_mod attribute + 3. 
Original model (WanDiscreteVideoTransformer) + + Args: + model: The model (may be wrapped by accelerate and/or torch.compile) + accelerator: Accelerator instance + + Returns: + The unwrapped model + """ + unwrapped = model + + # Step 1: Unwrap DDP/accelerate wrapper + try: + unwrapped = accelerator.unwrap_model(model) + except (KeyError, AttributeError): + pass + + # Step 2: Handle DDP directly if accelerator.unwrap_model didn't work + if hasattr(unwrapped, 'module'): + unwrapped = unwrapped.module + + # Step 3: Unwrap torch.compile (OptimizedModule has _orig_mod) + if hasattr(unwrapped, '_orig_mod'): + unwrapped = unwrapped._orig_mod + + # Step 4: Handle nested _orig_mod (in case of multiple compile calls) + while hasattr(unwrapped, '_orig_mod'): + unwrapped = unwrapped._orig_mod + + return unwrapped + +def main(args): + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + ) + + if accelerator.is_main_process: + os.makedirs(args.output_dir, exist_ok=True) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + + if accelerator.is_main_process: + accelerator.init_trackers("meissonic", config=vars(copy.deepcopy(args))) + + if args.seed is not None: + set_seed(args.seed) + + # Initialize text encoder and tokenizer for video training (T5/UMT5 only) + # Skip loading if using precomputed features (will load only during validation) + text_encoder = None + tokenizer = None + + if args.use_precomputed_features: + logger.info("Using precomputed features - skipping text encoder and video tokenizer loading during training") + logger.info("Text encoder and video tokenizer will be loaded only during validation/inference") + + # Get mask_token_id and codebook_size from metadata (required) + metadata_file = os.path.join(args.features_dir, "metadata.json") + if not os.path.exists(metadata_file): + raise ValueError(f"Metadata file not found: {metadata_file}. Please ensure features were extracted with extract_features.py.") + + import json + with open(metadata_file, 'r') as f: + metadata = json.load(f) + + codebook_size = metadata.get("codebook_size") + mask_token_id = metadata.get("mask_token_id") + + if codebook_size is None or mask_token_id is None: + raise ValueError( + f"codebook_size and mask_token_id must be in metadata.json. " + f"Found: codebook_size={codebook_size}, mask_token_id={mask_token_id}. " + f"Please re-run extract_features.py to ensure metadata is complete." 
+ ) + + logger.info(f"Loaded from metadata: codebook_size={codebook_size}, mask_token_id={mask_token_id}") + + # Create a minimal object with just the attributes we need + class MinimalTokenizer: + def __init__(self, mask_token_id, codebook_size): + self.mask_token_id = mask_token_id + self.codebook_size = codebook_size + video_tokenizer = MinimalTokenizer(mask_token_id, codebook_size) + logger.info(f"Minimal tokenizer created: mask_token_id={mask_token_id}, codebook_size={codebook_size}") + else: + # Load text encoder and tokenizer normally + if args.text_encoder_architecture in ["umt5-base", "umt5-xxl", "t5"]: + if args.resume_from_checkpoint: + text_encoder = T5EncoderModel.from_pretrained( + args.resume_from_checkpoint, subfolder="text_encoder", variant=args.variant + ) + tokenizer = T5Tokenizer.from_pretrained( + args.resume_from_checkpoint, subfolder="tokenizer", variant=args.variant + ) + else: + # Map architecture to model ID + if args.text_encoder_architecture == "umt5-base": + model_id = "google/umt5-base" + elif args.text_encoder_architecture == "umt5-xxl": + model_id = "google/umt5-xxl" + elif args.text_encoder_architecture == "t5": + model_id = "t5-base" # or "google/t5-v1_1-base" depending on your needs + else: + raise ValueError(f"Unknown text encoder architecture: {args.text_encoder_architecture}") + + text_encoder = T5EncoderModel.from_pretrained(model_id) + tokenizer = T5Tokenizer.from_pretrained(model_id) + logger.info(f"Loaded text encoder: {model_id} (d_model={text_encoder.config.d_model})") + + # Initialize video tokenizer for video training + device = accelerator.device + dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + dtype = torch.bfloat16 + + video_tokenizer = CosmosVideoTokenizer( + model_id=args.video_tokenizer_model_id, + device=device, + dtype=dtype + ) + video_tokenizer.requires_grad_(False) + + if not args.use_precomputed_features and args.train_text_encoder: + if args.text_encoder_use_lora: + lora_config = LoraConfig( + r=args.text_encoder_lora_r, + lora_alpha=args.text_encoder_lora_alpha, + target_modules=args.text_encoder_lora_target_modules, + ) + text_encoder.add_adapter(lora_config) + text_encoder.train() + text_encoder.requires_grad_(True) + else: + text_encoder.eval() + text_encoder.requires_grad_(False) + + # Initialize video transformer model + if args.training_from_scratch: + # Calculate compressed dimensions based on Cosmos tokenizer + # Cosmos compresses: F' = F // 8, H' = H // 16, W' = W // 16 + # However, actual encoding may have slight variations due to padding/rounding + # So we test with a dummy video to get the exact dimensions + if args.use_precomputed_features: + # For precomputed features, get dimensions from metadata or sample file + logger.info("Getting compressed dimensions from precomputed features...") + metadata_file = os.path.join(args.features_dir, "metadata.json") + + # Try to get dimensions from metadata first + F_prime, H_prime, W_prime = None, None, None + if os.path.exists(metadata_file): + import json + with open(metadata_file, 'r') as f: + metadata = json.load(f) + # Check if metadata has sample shape info + if 'samples' in metadata and len(metadata['samples']) > 0: + sample = metadata['samples'][0] + if 'video_code_shape' in sample: + shape = sample['video_code_shape'] + if len(shape) == 3: # [F', H', W'] + F_prime, H_prime, W_prime = shape[0], shape[1], shape[2] + logger.info(f"Got dimensions from metadata: F'={F_prime}, 
H'={H_prime}, W'={W_prime}") + + # If not in metadata, load a sample file + # if F_prime is None or H_prime is None or W_prime is None: + # logger.info("Loading a sample file to get dimensions...") + # # Try to find the first available sample + # video_codes_dir = os.path.join(args.features_dir, "video_codes") + # sample_path = None + # for level1 in range(1000): + # level1_dir = os.path.join(video_codes_dir, f"{level1:03d}") + # if not os.path.exists(level1_dir): + # continue + # for level2 in range(1000): + # level2_dir = os.path.join(level1_dir, f"{level2:03d}") + # if not os.path.exists(level2_dir): + # continue + # for level3 in range(1000): + # level3_dir = os.path.join(level2_dir, f"{level3:03d}") + # if not os.path.exists(level3_dir): + # continue + # # Find first .npy file + # for filename in os.listdir(level3_dir): + # if filename.endswith('.npy'): + # sample_path = os.path.join(level3_dir, filename) + # break + # if sample_path: + # break + # if sample_path: + # break + # if sample_path: + # break + + # if sample_path: + # import numpy as np + # sample_tokens = np.load(sample_path) # [F', H', W'] + # F_prime, H_prime, W_prime = sample_tokens.shape[0], sample_tokens.shape[1], sample_tokens.shape[2] + # logger.info(f"Got dimensions from sample file: F'={F_prime}, H'={H_prime}, W'={W_prime}") + # else: + # raise FileNotFoundError(f"Could not find any sample files in {video_codes_dir} to determine dimensions") + else: + # For non-precomputed features, use tokenizer to encode dummy video + dummy_video = torch.zeros(1, 3, args.num_frames, args.video_height, args.video_width, + device=accelerator.device, dtype=torch.float32) + with torch.no_grad(): + dummy_tokens = video_tokenizer.encode(dummy_video) # [1, F', H', W'] + F_prime, H_prime, W_prime = dummy_tokens.shape[1], dummy_tokens.shape[2], dummy_tokens.shape[3] + logger.info(f"Actual compressed dimensions from tokenizer: F'={F_prime}, H'={H_prime}, W'={W_prime}") + logger.info(f"Theoretical dimensions: F'={args.num_frames // video_tokenizer.t_downsample}, " + f"H'={args.video_height // video_tokenizer.h_downsample}, " + f"W'={args.video_width // video_tokenizer.w_downsample}") + + # Get text encoder dimension + if args.use_precomputed_features: + # For precomputed features, get text_dim from metadata or use default + text_dim_actual = None + metadata_file = os.path.join(args.features_dir, "metadata.json") + if os.path.exists(metadata_file): + import json + with open(metadata_file, 'r') as f: + metadata = json.load(f) + # Try to get from a sample + if 'samples' in metadata and len(metadata['samples']) > 0: + sample = metadata['samples'][0] + if 'text_embedding_shape' in sample: + shape = sample['text_embedding_shape'] + if len(shape) == 2: # [L, D] + text_dim_actual = shape[1] + logger.info(f"Got text_dim from metadata: {text_dim_actual}") + + # If not found, use default based on architecture + if text_dim_actual is None: + if args.text_encoder_architecture == "umt5-base": + text_dim_actual = 768 + elif args.text_encoder_architecture == "umt5-xxl": + text_dim_actual = 4096 + elif args.text_encoder_architecture == "t5": + text_dim_actual = 768 + else: + text_dim_actual = 768 # default + logger.info(f"Using default text_dim for {args.text_encoder_architecture}: {text_dim_actual}") + else: + text_dim_actual = text_encoder.config.d_model + + # If Wan pretrained path is provided, load config from it first + wan_config = None + if args.wan_pretrained_path is not None: + logger.info(f"Loading Wan config from: {args.wan_pretrained_path}") + 
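# Config discovery falls back in order: WanModel.from_pretrained at the repo root, then the "backbone" subfolder, then a raw config.json wrapped in a SimpleNamespace; if none succeed, the hard-coded defaults below (dim=2048, ffn_dim=8192, num_layers=32, num_heads=16) are used. +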
try: + # Try to load WanModel config + try: + wan_backbone_temp = WanModel.from_pretrained( + args.wan_pretrained_path, + subfolder=None, + low_cpu_mem_usage=True, + device_map=None + ) + wan_config = wan_backbone_temp.config + del wan_backbone_temp + except: + try: + wan_backbone_temp = WanModel.from_pretrained( + args.wan_pretrained_path, + subfolder="backbone", + low_cpu_mem_usage=True, + device_map=None + ) + wan_config = wan_backbone_temp.config + del wan_backbone_temp + except: + # Try loading config.json directly + import json + config_path = os.path.join(args.wan_pretrained_path, "config.json") + if os.path.exists(config_path): + with open(config_path, 'r') as f: + wan_config_dict = json.load(f) + # Create a simple config object + from types import SimpleNamespace + wan_config = SimpleNamespace(**wan_config_dict) + else: + logger.warning(f"Could not find config in {args.wan_pretrained_path}, using default values") + + if wan_config is not None: + logger.info(f"Loaded Wan config: dim={getattr(wan_config, 'dim', 'N/A')}, " + f"ffn_dim={getattr(wan_config, 'ffn_dim', 'N/A')}, " + f"num_layers={getattr(wan_config, 'num_layers', 'N/A')}, " + f"num_heads={getattr(wan_config, 'num_heads', 'N/A')}") + except Exception as e: + logger.warning(f"Failed to load Wan config: {e}, using default values") + + # Use Wan config if available, otherwise use defaults + dim = getattr(wan_config, 'dim', 2048) if wan_config else 2048 + ffn_dim = getattr(wan_config, 'ffn_dim', 8192) if wan_config else 8192 + num_layers = getattr(wan_config, 'num_layers', 32) if wan_config else 32 + num_heads = getattr(wan_config, 'num_heads', 16) if wan_config else 16 + freq_dim = getattr(wan_config, 'freq_dim', 256) if wan_config else 256 + in_dim = getattr(wan_config, 'in_dim', 16) if wan_config else 16 + out_dim = getattr(wan_config, 'out_dim', 16) if wan_config else 16 + + # text_dim: Use Wan's text_dim if available, but warn if it doesn't match text encoder + wan_text_dim = getattr(wan_config, 'text_dim', None) if wan_config else None + if wan_text_dim is not None and wan_text_dim != text_dim_actual: + logger.warning(f"Wan config text_dim ({wan_text_dim}) doesn't match text encoder dimension ({text_dim_actual}). 
" + f"Will use text encoder dimension and skip loading text_embedding weights.") + text_dim_for_model = text_dim_actual + else: + # Use Wan's text_dim if it matches, or use text encoder dimension + text_dim_for_model = wan_text_dim if wan_text_dim is not None else text_dim_actual + + model = WanDiscreteVideoTransformer( + codebook_size=video_tokenizer.codebook_size, + vocab_size=video_tokenizer.codebook_size + 1, + num_frames=F_prime, + height=H_prime, + width=W_prime, + model_type='t2v', + patch_size=(1, 2, 2), + text_len=512, + in_dim=in_dim, + dim=dim, + ffn_dim=ffn_dim, + freq_dim=freq_dim, + text_dim=text_dim_for_model, + out_dim=out_dim, + num_heads=num_heads, + num_layers=num_layers, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6 + ) + + # Load Wan pretrained weights into backbone if provided + if args.wan_pretrained_path is not None: + logger.info(f"Loading Wan pretrained weights from: {args.wan_pretrained_path}") + try: + # Check if it's a local path or HuggingFace model ID + is_local_path = os.path.exists(args.wan_pretrained_path) and os.path.isdir(args.wan_pretrained_path) + + if is_local_path: + # Local path: find the state dict file + state_dict_path = None + possible_paths = [ + os.path.join(args.wan_pretrained_path, "diffusion_pytorch_model.safetensors"), + os.path.join(args.wan_pretrained_path, "diffusion_pytorch_model.bin"), + os.path.join(args.wan_pretrained_path, "pytorch_model.bin"), + os.path.join(args.wan_pretrained_path, "model.safetensors"), + ] + for p in possible_paths: + if os.path.exists(p): + state_dict_path = p + break + + if state_dict_path is None: + raise FileNotFoundError(f"Could not find state dict in {args.wan_pretrained_path}") + + logger.info(f"Loading weights from local path: {state_dict_path}") + + # Load state dict from local file + if state_dict_path.endswith('.safetensors'): + from safetensors import safe_open + wan_state_dict = {} + with safe_open(state_dict_path, framework="pt", device="cpu") as f: + for k in f.keys(): + wan_state_dict[k] = f.get_tensor(k) + else: + wan_state_dict = torch.load(state_dict_path, map_location="cpu") + else: + # HuggingFace model ID: try to load using from_pretrained + logger.info(f"Loading weights from HuggingFace Hub: {args.wan_pretrained_path}") + try: + # Try loading as WanModel first + temp_model = WanModel.from_pretrained( + args.wan_pretrained_path, + subfolder=None, + low_cpu_mem_usage=False, + device_map=None + ) + wan_state_dict = temp_model.state_dict() + del temp_model + except: + # If that fails, try with 'backbone' subfolder + try: + temp_model = WanModel.from_pretrained( + args.wan_pretrained_path, + subfolder="backbone", + low_cpu_mem_usage=False, + device_map=None + ) + wan_state_dict = temp_model.state_dict() + del temp_model + except: + # Last resort: try to download and load state dict directly + from huggingface_hub import hf_hub_download + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + # Try different possible filenames + possible_files = [ + "diffusion_pytorch_model.safetensors", + "diffusion_pytorch_model.bin", + "pytorch_model.bin", + "model.safetensors", + ] + state_dict_path = None + for filename in possible_files: + try: + state_dict_path = hf_hub_download( + repo_id=args.wan_pretrained_path, + filename=filename, + cache_dir=tmpdir + ) + break + except: + continue + + if state_dict_path is None: + raise FileNotFoundError( + f"Could not find state dict file in HuggingFace model {args.wan_pretrained_path}" + ) + + # Load state dict + if 
state_dict_path.endswith('.safetensors'): + from safetensors import safe_open + wan_state_dict = {} + with safe_open(state_dict_path, framework="pt", device="cpu") as f: + for k in f.keys(): + wan_state_dict[k] = f.get_tensor(k) + else: + wan_state_dict = torch.load(state_dict_path, map_location="cpu") + + # Remove text_embedding weights if input dimension doesn't match + # This is necessary when using a different text encoder (e.g., UMT5-base with 768 dim + # vs Wan's original 4096 dim) + # Check the first text_embedding layer's input dimension (text_embedding.0.weight shape[1]) + text_embedding_key = 'text_embedding.0.weight' + if text_embedding_key in wan_state_dict: + pretrained_text_dim = wan_state_dict[text_embedding_key].shape[1] # Input dimension + model_text_dim = model.backbone.text_embedding[0].weight.shape[1] # Model's expected input dimension + + if pretrained_text_dim != model_text_dim: + # Remove all text_embedding related keys + keys_to_remove = [k for k in wan_state_dict.keys() if 'text_embedding' in k] + for k in keys_to_remove: + del wan_state_dict[k] + logger.info(f"Removed {len(keys_to_remove)} text_embedding keys due to input dimension mismatch " + f"(pretrained: {pretrained_text_dim}, model: {model_text_dim})") + + # Load into model's backbone + missing_keys, unexpected_keys = model.backbone.load_state_dict(wan_state_dict, strict=False) + + # Log results + if missing_keys: + # Filter out expected missing keys (text_embedding if removed) + actual_missing = [k for k in missing_keys if 'text_embedding' not in k] + if actual_missing: + logger.warning(f"Missing keys when loading Wan weights: {actual_missing[:10]}..." + if len(actual_missing) > 10 else f"Missing keys: {actual_missing}") + else: + logger.info(f"Only text_embedding keys are missing (expected due to text_dim mismatch)") + if unexpected_keys: + logger.warning(f"Unexpected keys when loading Wan weights: {unexpected_keys[:10]}..." 
+ if len(unexpected_keys) > 10 else f"Unexpected keys: {unexpected_keys}") + + logger.info("✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding)") + + except Exception as e: + logger.warning(f"Failed to load Wan pretrained weights: {e}") + import traceback + traceback.print_exc() + logger.warning("Continuing with random initialization") + else: + # Load from pretrained checkpoint + model = WanDiscreteVideoTransformer.from_pretrained( + args.pretrained_model_name_or_path, subfolder="transformer", low_cpu_mem_usage=False, device_map=None + ) + + # Save vocab_size before torch.compile (for use in training loop) + # This avoids issues with accelerate.unwrap_model when using torch.compile + vocab_size = model.vocab_size + + # Convert model to correct dtype before torch.compile + # This ensures all layers (especially text_embedding which is randomly initialized) are on the right dtype + if accelerator.mixed_precision == "fp16": + model = model.to(dtype=torch.float16) + elif accelerator.mixed_precision == "bf16": + model = model.to(dtype=torch.bfloat16) + # else: keep float32 + + model = torch.compile(model) + + if args.use_lora: + lora_config = LoraConfig( + r=args.lora_r, + lora_alpha=args.lora_alpha, + target_modules=args.lora_target_modules, + ) + model.add_adapter(lora_config) + + model.train() + + # Freeze Wan backbone if requested + if args.freeze_wan_backbone: + for name, param in model.named_parameters(): + if 'backbone' in name: + param.requires_grad = False + logger.info("Wan backbone parameters are frozen (requires_grad=False)") + + if args.gradient_checkpointing: + model.enable_gradient_checkpointing() + if args.train_text_encoder and not args.use_precomputed_features: + # Only enable gradient checkpointing for text_encoder if it's loaded + text_encoder.gradient_checkpointing_enable() + + # EMA is not used for video training + ema = None + + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + transformer_lora_layers_to_save = None + text_encoder_lora_layers_to_save = None + + for model_ in models: + # Unwrap model_ to get the actual model type (handles torch.compile wrapping) + unwrapped_model_ = safe_unwrap_model(model_, accelerator) + + # Use class name comparison for more robust type checking + # This handles cases where the same class might be loaded from different modules + model_class_name = unwrapped_model_.__class__.__name__ + + if model_class_name == "WanDiscreteVideoTransformer": + if args.use_lora: + transformer_lora_layers_to_save = get_peft_model_state_dict(model_) + else: + # Unwrap before saving to avoid torch.compile issues + unwrapped_model_.save_pretrained(os.path.join(output_dir, "transformer")) + elif model_class_name in ["T5EncoderModel", "T5Model"]: + if args.text_encoder_use_lora: + text_encoder_lora_layers_to_save = get_peft_model_state_dict(model_) + else: + # Unwrap before saving to avoid torch.compile issues + unwrapped_model_.save_pretrained(os.path.join(output_dir, "text_encoder")) + else: + raise ValueError(f"unexpected save model: {model_.__class__}, unwrapped: {unwrapped_model_.__class__.__name__}") + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + if transformer_lora_layers_to_save is not None or text_encoder_lora_layers_to_save is not None: + LoraLoaderMixin.save_lora_weights( + output_dir, + unet_lora_layers=transformer_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_lora_layers_to_save, + ) + + # EMA not used for video 
training + + def load_model_hook(models, input_dir): + transformer = None + text_encoder_ = None + + # this part is added for keep consistency when add model.compile() in the model + def adap_compile(ori_dict):#add '_orig_mod.' to each key + new_dict = {} + for k,v in ori_dict.items(): + new_dict['_orig_mod.'+k] = v + return new_dict + + while len(models) > 0: + model_ = models.pop() + + # Unwrap model to get the actual class name + unwrapped_model_ = safe_unwrap_model(model_, accelerator) + model_class_name = unwrapped_model_.__class__.__name__ + + if model_class_name == "WanDiscreteVideoTransformer": + if args.use_lora: + transformer = model_ + else: + load_model = WanDiscreteVideoTransformer.from_pretrained(os.path.join(input_dir, "transformer"), low_cpu_mem_usage=False, device_map=None) + model_.load_state_dict(adap_compile(load_model.state_dict())) + del load_model + elif model_class_name in ["T5EncoderModel", "T5Model"]: + if args.text_encoder_use_lora: + text_encoder_ = model_ + else: + try: + load_model = T5EncoderModel.from_pretrained(os.path.join(input_dir, "text_encoder")) + model_.load_state_dict(load_model.state_dict()) + except: + print('Not found text-encoder model in current folder. Loading default UMT5-base.') + load_model = T5EncoderModel.from_pretrained("google/umt5-base") + model_.load_state_dict(load_model.state_dict()) + del load_model + else: + raise ValueError(f"unexpected load model: {model_.__class__}, unwrapped: {model_class_name}") + + if transformer is not None or text_encoder_ is not None: + lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + LoraLoaderMixin.load_lora_into_text_encoder( + lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_ + ) + LoraLoaderMixin.load_lora_into_transformer( + lora_state_dict, network_alphas=network_alphas, transformer=transformer + ) + + # EMA not used for video training + + accelerator.register_load_state_pre_hook(load_model_hook) + accelerator.register_save_state_pre_hook(save_model_hook) + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + ) + + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "Please install bitsandbytes to use 8-bit Adam. 
You can do so by running `pip install bitsandbytes`" + ) + + optimizer_cls = bnb.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + # Separate Wan backbone parameters from other parameters (token_embedding, logits_head) + # This allows different learning rates for backbone vs head/tail + backbone_params = [] + other_params = [] + + for name, param in model.named_parameters(): + if 'backbone' in name: + backbone_params.append((name, param)) + else: + other_params.append((name, param)) + + # Log parameter counts + backbone_param_count = sum(p.numel() for _, p in backbone_params) + other_param_count = sum(p.numel() for _, p in other_params) + total_param_count = sum(p.numel() for _, p in model.named_parameters()) + logger.info(f"Parameter counts: backbone={backbone_param_count:,}, other={other_param_count:,}, total={total_param_count:,}") + + # no decay on bias and layernorm and embedding + no_decay = ["bias", "layer_norm.weight", "mlm_ln.weight", "embeddings.weight"] + + # Calculate backbone lr + if args.freeze_wan_backbone: + backbone_lr = 0.0 + logger.info("Wan backbone is frozen (lr=0)") + else: + backbone_lr = args.learning_rate * args.wan_backbone_lr_ratio + logger.info(f"Wan backbone lr = {backbone_lr:.6f} (base_lr * {args.wan_backbone_lr_ratio})") + + logger.info(f"Other parts (token_embedding, logits_head) lr = {args.learning_rate:.6f}") + + # Group parameters: backbone and other parts, each with decay and no_decay + optimizer_grouped_parameters = [] + + # Backbone parameters with weight decay + if backbone_params: + backbone_with_decay = [p for n, p in backbone_params if not any(nd in n for nd in no_decay)] + if backbone_with_decay: + optimizer_grouped_parameters.append({ + "params": backbone_with_decay, + "lr": backbone_lr, + "weight_decay": args.adam_weight_decay, + }) + + # Backbone parameters without weight decay + backbone_no_decay = [p for n, p in backbone_params if any(nd in n for nd in no_decay)] + if backbone_no_decay: + optimizer_grouped_parameters.append({ + "params": backbone_no_decay, + "lr": backbone_lr, + "weight_decay": 0.0, + }) + + # Other parameters (token_embedding, logits_head) with weight decay + if other_params: + other_with_decay = [p for n, p in other_params if not any(nd in n for nd in no_decay)] + if other_with_decay: + optimizer_grouped_parameters.append({ + "params": other_with_decay, + "lr": args.learning_rate, + "weight_decay": args.adam_weight_decay, + }) + + # Other parameters without weight decay + other_no_decay = [p for n, p in other_params if any(nd in n for nd in no_decay)] + if other_no_decay: + optimizer_grouped_parameters.append({ + "params": other_no_decay, + "lr": args.learning_rate, + "weight_decay": 0.0, + }) + + if args.train_text_encoder and not args.use_precomputed_features: + # Only add text_encoder to optimizer if it's loaded (not using precomputed features) + optimizer_grouped_parameters.append( + {"params": text_encoder.parameters(), "weight_decay": args.adam_weight_decay} + ) + + optimizer = optimizer_cls( + optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + logger.info("Creating dataloaders and lr_scheduler") + + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + # Video training datasets + if args.use_precomputed_features: + # Use pre-extracted features + logger.info(f"Using pre-extracted features from: {args.features_dir}") + dataset = 
PrecomputedFeatureDataset( + features_dir=args.features_dir, + ) + elif args.instance_dataset == 'OpenVid1MDataset': + # OpenVid1M dataset from CSV file + csv_path = args.instance_data_dir + if not os.path.exists(csv_path): + raise FileNotFoundError(f"CSV file not found: {csv_path}") + + # Video root directory: assume videos are in the same directory as CSV or in a 'video_reorg' subdirectory + csv_dir = os.path.dirname(csv_path) + # Try to find video directory + if os.path.exists(os.path.join(csv_dir, 'video_reorg')): + video_root_dir = os.path.join(csv_dir, 'video_reorg') + elif os.path.exists(os.path.join(os.path.dirname(csv_dir), 'video_reorg')): + video_root_dir = os.path.join(os.path.dirname(csv_dir), 'video_reorg') + else: + # Fallback: use CSV directory + video_root_dir = csv_dir + logger.warning(f"Video directory not found, using CSV directory: {video_root_dir}") + + dataset = OpenVid1MDataset( + csv_path=csv_path, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + text_encoder_architecture=args.text_encoder_architecture, + prompt_prefix=args.prompt_prefix, + ) + elif args.instance_dataset == 'HuggingFaceDataset' or args.instance_dataset == 'VideoDataset': + dataset = VideoDataset( + hf_dataset=load_dataset(args.instance_data_dir, split="train"), + tokenizer=tokenizer, + video_key=args.image_key if args.image_key else "video", + prompt_key=args.prompt_key if args.prompt_key else "caption", + prompt_prefix=args.prompt_prefix, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + text_encoder_architecture=args.text_encoder_architecture + ) + else: + raise ValueError(f"For video training, instance_dataset must be 'OpenVid1MDataset', 'HuggingFaceDataset' or 'VideoDataset', got '{args.instance_dataset}'") + + # Adjust DataLoader settings for precomputed features to reduce memory usage + if args.use_precomputed_features: + # For precomputed features, reduce prefetch to save memory + # Features are already extracted, so we don't need as much prefetching + prefetch_factor = min(args.dataloader_prefetch_factor, 1) if args.dataloader_num_workers > 0 else None + # Consider disabling pin_memory for precomputed features if memory is tight + # pin_memory=True is faster but uses more memory + pin_memory = True # Keep True for performance, but can be set to False if OOM persists + logger.info(f"Using precomputed features - DataLoader settings: prefetch_factor={prefetch_factor}, pin_memory={pin_memory}") + else: + prefetch_factor = args.dataloader_prefetch_factor if args.dataloader_num_workers > 0 else None + pin_memory = True + + train_dataloader = DataLoader( + dataset, + batch_size=args.train_batch_size, + shuffle=True, + num_workers=args.dataloader_num_workers, + collate_fn=default_collate, + pin_memory=pin_memory, + prefetch_factor=prefetch_factor, + persistent_workers=args.dataloader_num_workers > 0, # Keep workers alive between epochs + ) + train_dataloader.num_batches = len(train_dataloader) + + # Log dataloader configuration for performance monitoring + if accelerator.is_main_process: + logger.info(f"Dataloader configuration:") + logger.info(f" - num_workers: {args.dataloader_num_workers} (0 = single-threaded, recommended: 4-8 for video)") + logger.info(f" - prefetch_factor: {args.dataloader_prefetch_factor if args.dataloader_num_workers > 0 else 'N/A (num_workers=0)'}") + logger.info(f" - persistent_workers: {args.dataloader_num_workers > 0}") + logger.info(f" - 
pin_memory: True") + if args.dataloader_num_workers == 0: + logger.warning( + "⚠️ num_workers=0 may cause GPU underutilization. " + "Consider setting --dataloader_num_workers 4-8 to improve GPU utilization." + ) + + # Calculate max_train_steps if not provided + if args.max_train_steps is None: + # Default to 1 epoch if not specified + num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps) + args.max_train_steps = num_update_steps_per_epoch + logger.warning(f"max_train_steps not specified, defaulting to 1 epoch ({args.max_train_steps} steps)") + + lr_scheduler = diffusers.optimization.get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_training_steps=args.max_train_steps * accelerator.num_processes, + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + ) + + logger.info("Preparing model, optimizer and dataloaders") + + if args.use_precomputed_features: + # Don't prepare text_encoder if using precomputed features + model, optimizer, lr_scheduler, train_dataloader = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader + ) + elif args.train_text_encoder: + model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader, text_encoder + ) + else: + model, optimizer, lr_scheduler, train_dataloader = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader + ) + + train_dataloader.num_batches = len(train_dataloader) + + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + if not args.use_precomputed_features: + if not args.train_text_encoder: + text_encoder.to(device=accelerator.device, dtype=weight_dtype) + # Video tokenizer is already on the correct device + else: + # For precomputed features, text_encoder is None, so skip + logger.info("Skipping text_encoder.to() - using precomputed features") + + # EMA not used for video training + + if not args.use_precomputed_features: + with nullcontext() if args.train_text_encoder else torch.no_grad(): + # T5/UMT5 doesn't have cond_embeds, only encoder_hidden_states + empty_embeds, _ = encode_prompt( + text_encoder, tokenize_prompt(tokenizer, "", args.text_encoder_architecture).to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + empty_clip_embeds = None # Not used for T5 + + # Video training doesn't use instance_data_image + else: + # For precomputed features, load empty_embeds from file if needed + empty_clip_embeds = None # Not used for T5 + if args.cond_dropout_prob > 0.0: + if args.empty_embeds_path is None: + raise ValueError( + "--empty_embeds_path is required when --use_precomputed_features is set " + "and --cond_dropout_prob > 0.0" + ) + logger.info(f"Loading empty_embeds from: {args.empty_embeds_path}") + + # Load empty_embeds from .npy file (more space-efficient than .pt) + import numpy as np + empty_embeds_np = np.load(args.empty_embeds_path) # Load as numpy array + empty_embeds = torch.from_numpy(empty_embeds_np).to(dtype=weight_dtype) # Convert to tensor + + # Load metadata to verify compatibility + metadata_file = os.path.join(args.features_dir, "metadata.json") + if os.path.exists(metadata_file): + import json + with open(metadata_file, 'r') as f: + metadata = json.load(f) + logger.info(f"Empty embeds info from metadata: shape={metadata.get('empty_embeds_shape')}") + # Verify text encoder architecture 
matches + if metadata.get("text_encoder_architecture") != args.text_encoder_architecture: + logger.warning( + f"Text encoder architecture mismatch: " + f"empty_embeds was extracted with {metadata.get('text_encoder_architecture')}, " + f"but training uses {args.text_encoder_architecture}" + ) + + # Note: We don't move to device here, will move during training loop if needed + logger.info(f"Loaded empty_embeds: shape={empty_embeds.shape}, dtype={empty_embeds.dtype}") + else: + empty_embeds = None + logger.info("Skipping empty_embeds loading - cond_dropout_prob is 0.0") + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps) + # Afterwards we recalculate our number of training epochs. + # Note: We are not doing epoch based training here, but just using this for book keeping and being able to + # reuse the same training loop with other datasets/loaders. + # max_train_steps should already be set above if it was None + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if num_update_steps_per_epoch > 0 else 1 + + # Train! + logger.info("***** Running training *****") + logger.info(f" Num training steps = {args.max_train_steps}") + logger.info(f" Instantaneous batch size per device = { args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + + resume_from_checkpoint = args.resume_from_checkpoint + if resume_from_checkpoint: + if resume_from_checkpoint == "latest": + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + if len(dirs) > 0: + resume_from_checkpoint = os.path.join(args.output_dir, dirs[-1]) + else: + resume_from_checkpoint = None + + if resume_from_checkpoint is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + else: + accelerator.print(f"Resuming from checkpoint {resume_from_checkpoint}") + + if resume_from_checkpoint is None: + global_step = 0 + first_epoch = 0 + else: + accelerator.load_state(resume_from_checkpoint) + global_step = int(os.path.basename(resume_from_checkpoint).split("-")[1]) + first_epoch = global_step // num_update_steps_per_epoch + + # EMA not used for video training + + # As stated above, we are not doing epoch based training here, but just using this for book keeping and being able to + # reuse the same training loop with other datasets/loaders. 
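+    # Training loop overview (masked-token prediction): per sample, draw t ~ U(0, 1) and set
+    # mask_prob = cos(t * pi / 2), clipped to at least --min_masking_rate; replace that fraction of the
+    # flattened video tokens with mask_token_id (= codebook_size) and set labels to -100 everywhere else,
+    # so the cross-entropy loss only scores masked positions. With --cond_dropout_prob > 0, text embeddings
+    # are occasionally swapped for empty_embeds, which is what enables classifier-free guidance at inference.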
+ for epoch in range(first_epoch, num_train_epochs): + for batch in train_dataloader: + with torch.no_grad(): + if args.use_precomputed_features: + # Use pre-extracted features + # Features are already on CPU with correct dtypes (video_codes: int32, text_embedding: float16/bfloat16) + # Move to device and convert dtype in one step to avoid intermediate copies + weight_dtype = { + "fp16": torch.float16, + "bf16": torch.bfloat16, + }.get(accelerator.mixed_precision, torch.float32) + + video_tokens = batch["video_codes"].to( + device=accelerator.device, + non_blocking=False, # CPU->GPU + ) # [B, F', H', W'], int32/int64, on GPU + + encoder_hidden_states = batch["text_embedding"].to( + device=accelerator.device, + dtype=weight_dtype, + non_blocking=False, + ) # [B, L, D], float16/bfloat16, on GPU + + batch_size = video_tokens.shape[0] + + # Delete batch after moving data to GPU to free CPU memory + del batch + + # Debug print (only on main process, first batch) + if accelerator.is_main_process and global_step == 0 and epoch == first_epoch: + print(f"[DEBUG] video_tokens: shape={video_tokens.shape}, dtype={video_tokens.dtype}, device={video_tokens.device}") + print(f"[DEBUG] encoder_hidden_states: shape={encoder_hidden_states.shape}, dtype={encoder_hidden_states.dtype}, device={encoder_hidden_states.device}") + else: + # Video training path - encode on-the-fly + video_values = batch["video"].to(accelerator.device, non_blocking=True) # [B, C, F, H, W] + batch_size = video_values.shape[0] + + # Encode video to discrete tokens using CosmosVideoTokenizer + split_batch_size = args.split_vae_encode if args.split_vae_encode is not None else batch_size + num_splits = math.ceil(batch_size / split_batch_size) + video_tokens = [] + for i in range(num_splits): + start_idx = i * split_batch_size + end_idx = min((i + 1) * split_batch_size, batch_size) + # video_values: [B, C, F, H, W] + tokens = video_tokenizer.encode(video_values[start_idx:end_idx]) # [B, F', H', W'] + video_tokens.append(tokens) + video_tokens = torch.cat(video_tokens, dim=0) # [B, F', H', W'] + + if "prompt_input_ids" in batch: + with nullcontext() if args.train_text_encoder else torch.no_grad(): + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + + # Flatten video tokens for masking: [B, F', H', W'] -> [B, F'*H'*W'] + B, F_prime, H_prime, W_prime = video_tokens.shape + seq_len = F_prime * H_prime * W_prime + video_tokens_flat = video_tokens.view(B, seq_len) # [B, seq_len] + + timesteps = torch.rand(batch_size, device=video_tokens_flat.device) + mask_prob = torch.cos(timesteps * math.pi * 0.5) + mask_prob = mask_prob.clip(args.min_masking_rate) + + num_token_masked = (seq_len * mask_prob).round().clamp(min=1) + batch_randperm = torch.rand(batch_size, seq_len, device=video_tokens_flat.device).argsort(dim=-1) + mask = batch_randperm < num_token_masked.unsqueeze(-1) + + mask_id = video_tokenizer.mask_token_id # codebook_size + input_ids_flat = torch.where(mask, mask_id, video_tokens_flat) + labels_flat = torch.where(mask, video_tokens_flat, -100) + + # Reshape back to [B, F', H', W'] for model forward + input_ids = input_ids_flat.view(B, F_prime, H_prime, W_prime) + labels = labels_flat.view(B, F_prime, H_prime, W_prime) + + if args.cond_dropout_prob > 0.0: + assert encoder_hidden_states is not None + assert empty_embeds is not None, "empty_embeds must be loaded when cond_dropout_prob > 0.0" + + batch_size = 
encoder_hidden_states.shape[0] + + # Move empty_embeds to device if needed (for precomputed features case) + if empty_embeds.device != encoder_hidden_states.device: + empty_embeds = empty_embeds.to(encoder_hidden_states.device) + + mask = ( + torch.zeros((batch_size, 1, 1), device=encoder_hidden_states.device).float().uniform_(0, 1) + < args.cond_dropout_prob + ) + + empty_embeds_ = empty_embeds.expand(batch_size, -1, -1) + encoder_hidden_states = torch.where( + (encoder_hidden_states * mask).bool(), encoder_hidden_states, empty_embeds_ + ) + + # Handle cond_embeds dropout (only for CLIP, not for T5) + # For T5/UMT5, cond_embeds is None, so skip this step + + # Video tokens are already in [B, F', H', W'] format, no need to reshape + + if not args.use_precomputed_features and "prompt_input_ids" in batch: + with nullcontext() if args.train_text_encoder else torch.no_grad(): + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + + # Train Step + with accelerator.accumulate(model): + # Video training: use WanDiscreteVideoTransformer + # vocab_size is already saved before torch.compile + + # Prepare timesteps: [B] -> [B] (scalar timesteps for video) + timesteps_tensor = (mask_prob * 1000).long().to(input_ids.device) + + # Ensure encoder_hidden_states is on correct dtype + # Note: For precomputed features, dtype is already converted above + if encoder_hidden_states is not None and not args.use_precomputed_features: + # Only convert if not using precomputed features (already converted above) + if accelerator.mixed_precision == "fp16" and encoder_hidden_states.dtype != torch.float16: + encoder_hidden_states = encoder_hidden_states.to(dtype=torch.float16) + elif accelerator.mixed_precision == "bf16" and encoder_hidden_states.dtype != torch.bfloat16: + encoder_hidden_states = encoder_hidden_states.to(dtype=torch.bfloat16) + elif accelerator.mixed_precision == "no" and encoder_hidden_states.dtype != torch.float32: + encoder_hidden_states = encoder_hidden_states.to(dtype=torch.float32) + + # Forward pass: input_ids is [B, F', H', W'], encoder_hidden_states is [B, L, D] + logits = model( + tokens=input_ids, # [B, F', H', W'] + timesteps=timesteps_tensor, # [B] + encoder_hidden_states=encoder_hidden_states, # [B, L, D] + y=None, + ) # Returns [B, vocab_size, F', H', W'] + + # Reshape logits and labels for loss computation + # logits: [B, vocab_size, F', H', W'] -> [B*F'*H'*W', vocab_size] + B, vocab_size, F_prime_logits, H_prime_logits, W_prime_logits = logits.shape + logits = logits.permute(0, 2, 3, 4, 1).reshape(B * F_prime_logits * H_prime_logits * W_prime_logits, vocab_size) + + # labels: [B, F', H', W'] - may have different dimensions due to patch/unpatch operations + # Crop labels to match logits dimensions if needed + B_labels, F_prime_labels, H_prime_labels, W_prime_labels = labels.shape + assert B == B_labels, f"Batch size mismatch: logits {B} vs labels {B_labels}" + + # Crop labels to match logits spatial dimensions + if F_prime_labels != F_prime_logits or H_prime_labels != H_prime_logits or W_prime_labels != W_prime_logits: + # Crop labels to match logits dimensions + labels = labels[:, :F_prime_logits, :H_prime_logits, :W_prime_logits] + + # labels: [B, F', H', W'] -> [B*F'*H'*W'] + labels_flat = labels.reshape(-1) + + # Convert to long (int64) for cross_entropy (required by CUDA kernel) + # video_tokens might be int32 from precomputed features + labels_flat = 
labels_flat.long() + + loss = F.cross_entropy( + logits, + labels_flat, + ignore_index=-100, + reduction="mean", + ) + + # Gather the losses across all processes for logging (if we use distributed training). + avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + avg_masking_rate = accelerator.gather(mask_prob.repeat(args.train_batch_size)).mean() + + accelerator.backward(loss) + + if args.max_grad_norm is not None and accelerator.sync_gradients: + accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + + optimizer.zero_grad(set_to_none=True) + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + # EMA not used for video training + + if (global_step + 1) % args.logging_steps == 0: + logs = { + "step_loss": avg_loss.item(), + "lr": lr_scheduler.get_last_lr()[0], + "avg_masking_rate": avg_masking_rate.item(), + } + accelerator.log(logs, step=global_step + 1) + + logger.info( + f"Step: {global_step + 1} " + f"Loss: {avg_loss.item():0.4f} " + f"LR: {lr_scheduler.get_last_lr()[0]:0.6f}" + ) + + if (global_step + 1) % args.checkpointing_steps == 0: + save_checkpoint(args, accelerator, global_step + 1, logger) + + if (global_step + 1) % args.validation_steps == 0 and accelerator.is_main_process: + # EMA not used for video training + + with torch.no_grad(): + logger.info("Generating videos for validation...") + + model.eval() + + # Load text encoder and video tokenizer for validation if using precomputed features + # Use different variable names to avoid shadowing global variables + val_text_encoder = None + val_tokenizer = None + val_video_tokenizer = None + + if args.use_precomputed_features: + logger.info("Loading text encoder and video tokenizer for validation...") + + # Load text encoder + if args.text_encoder_architecture == "umt5-base": + model_id = "google/umt5-base" + elif args.text_encoder_architecture == "umt5-xxl": + model_id = "google/umt5-xxl" + elif args.text_encoder_architecture == "t5": + model_id = "t5-base" + else: + raise ValueError(f"Unknown text encoder architecture: {args.text_encoder_architecture}") + + val_text_encoder = T5EncoderModel.from_pretrained(model_id) + val_tokenizer = T5Tokenizer.from_pretrained(model_id) + val_text_encoder.to(device=accelerator.device, dtype=weight_dtype) + val_text_encoder.eval() + val_text_encoder.requires_grad_(False) + + # Load video tokenizer + val_video_tokenizer = CosmosVideoTokenizer( + model_id=args.video_tokenizer_model_id, + device=accelerator.device, + dtype=weight_dtype + ) + val_video_tokenizer.requires_grad_(False) + val_video_tokenizer.eval() + + logger.info("Text encoder and video tokenizer loaded for validation") + else: + # Use global variables when not using precomputed features + val_text_encoder = text_encoder + val_tokenizer = tokenizer + val_video_tokenizer = video_tokenizer + if args.train_text_encoder: + val_text_encoder.eval() + + # Video pipeline validation + logger.info("Generating videos for validation...") + + # For video, create scheduler with mask_token_id + scheduler = Scheduler( + mask_token_id=val_video_tokenizer.mask_token_id, + masking_schedule="cosine" + ) + scheduler.set_timesteps(num_inference_steps=48, device=accelerator.device) + + # Get unwrapped transformer and ensure it's on correct dtype + unwrapped_transformer = safe_unwrap_model(model, accelerator) + # Ensure transformer is on the correct dtype (text_embedding was randomly initialized as float32) + 
unwrapped_transformer = unwrapped_transformer.to(dtype=weight_dtype) + + pipe = VideoPipeline( + tokenizer=val_tokenizer, + text_encoder=val_text_encoder, + transformer=unwrapped_transformer, + scheduler=scheduler, + video_tokenizer=val_video_tokenizer, + text_len=512, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + ) + + # Generate videos + try: + videos = pipe( + prompt=args.validation_prompts, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + guidance_scale=9.0, + num_inference_steps=48, + output_type="pil", + ).videos + + # Log videos to wandb (save first frame of each video) + if is_wandb_available(): + wandb_images = [] + for i, video in enumerate(videos): + if isinstance(video, list) and len(video) > 0: + first_frame = video[0] + elif isinstance(video, torch.Tensor): + first_frame = transforms.ToPILImage()(video[:, 0, :, :].clamp(0, 1)) + else: + first_frame = video + if first_frame is not None: + prompt_caption = args.validation_prompts[i] if i < len(args.validation_prompts) else f"video_{i}" + wandb_images.append(wandb.Image(first_frame, caption=prompt_caption)) + if wandb_images: + wandb.log({"generated_videos_first_frame": wandb_images}, step=global_step + 1) + + # Save video frames as grid + for i, video in enumerate(videos): + if isinstance(video, list): + frames = [transforms.ToTensor()(frame) for frame in video] + if frames: + frames_tensor = torch.stack(frames, dim=0) + grid = make_grid(frames_tensor, nrow=min(4, len(frames))) + grid_path = os.path.join(args.output_dir, f"{global_step}_video_{i}_CFG-9.png") + save_image(grid, grid_path) + if is_wandb_available(): + wandb.log( + {"generated_videos_grid": wandb.Image(grid, caption=f"video_{i}_grid")}, + step=global_step + 1, + ) + elif isinstance(video, torch.Tensor): + C, num_frames_video, H, W = video.shape + frames_list = [video[:, f, :, :] for f in range(num_frames_video)] + frames_tensor = torch.stack(frames_list, dim=0) + grid = make_grid(frames_tensor, nrow=min(4, num_frames_video)) + grid_path = os.path.join(args.output_dir, f"{global_step}_video_{i}_CFG-9.png") + save_image(grid, grid_path) + if is_wandb_available(): + wandb.log( + {"generated_videos_grid": wandb.Image(grid, caption=f"video_{i}_grid")}, + step=global_step + 1, + ) + + logger.info(f"Validation videos saved to {args.output_dir}") + except Exception as e: + logger.error(f"Video validation failed: {e}") + import traceback + traceback.print_exc() + finally: + # Clean up models loaded for validation (if using precomputed features) + if args.use_precomputed_features: + # Delete validation models to free GPU memory + if 'val_text_encoder' in locals(): + del val_text_encoder + if 'val_tokenizer' in locals(): + del val_tokenizer + if 'val_video_tokenizer' in locals(): + del val_video_tokenizer + if 'pipe' in locals(): + del pipe + if 'scheduler' in locals(): + del scheduler + # Clear CUDA cache + torch.cuda.empty_cache() + logger.info("Cleaned up validation models and freed GPU memory") + + model.train() + + if args.train_text_encoder and not args.use_precomputed_features: + # Only set train mode if text_encoder is still loaded (not using precomputed features) + if text_encoder is not None: + text_encoder.train() + + # EMA not used for video training + + global_step += 1 + + # Stop training if max steps is reached + if global_step >= args.max_train_steps: + break + # End for + + accelerator.wait_for_everyone() + + # Evaluate and save checkpoint at the end of training + 
save_checkpoint(args, accelerator, global_step, logger) + + # Save the final trained checkpoint + if accelerator.is_main_process: + model = safe_unwrap_model(model, accelerator) + # EMA not used for video training + model.save_pretrained(args.output_dir) + + accelerator.end_training() + + + + + +if __name__ == "__main__": + main(parse_args()) + + diff --git a/Meissonic/train/train_meissonic.py b/Meissonic/train/train_meissonic.py new file mode 100644 index 0000000000000000000000000000000000000000..432ff946f0fcf1a020e417a12d6afa1a048374f2 --- /dev/null +++ b/Meissonic/train/train_meissonic.py @@ -0,0 +1,1085 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import copy +import logging +import math +import os +from contextlib import nullcontext +from pathlib import Path +import sys +sys.path.append(os.getcwd()) +import torch +import torch.nn.functional as F +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from peft import LoraConfig +from peft.utils import get_peft_model_state_dict +from torch.utils.data import DataLoader, default_collate +from torchvision import transforms +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, +) +import diffusers.optimization +from diffusers import EMAModel, VQModel +from src.scheduler import Scheduler +from diffusers.loaders import LoraLoaderMixin +from diffusers.utils import is_wandb_available +from src.pipeline import Pipeline +from torchvision.utils import save_image,make_grid +from datasets import load_dataset +from train.trainer_utils import save_checkpoint +from train.dataset_utils import MyParquetDataset, HuggingFaceDataset +from train.dataset_utils import tokenize_prompt, encode_prompt +from src.transformer import Transformer2DModel + +if is_wandb_available(): + import wandb + # wandb.login(key="") + +logger = get_logger(__name__, log_level="INFO") + +import torch._dynamo +torch._dynamo.config.verbose = True + +# Optionally suppress errors to fall back to eager execution +torch._dynamo.config.suppress_errors = True + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_architecture", + type=str, + default="Meissonic", + required=False + ) + parser.add_argument( + "--text_encoder_architecture", + type=str, + default="open_clip", + required=False, + help="The architecture of the text encoder. One of ['CLIP', 'open_clip', 'flan-t5-base','Qwen2-0.5B','gemini-2b',long_CLIP_T5_base','CLIP_T5_base']", + ) + parser.add_argument( + "--instance_dataset", + type=str, + default=None, + required=False, + help="The dataset to use for training. 
One of ['MSCOCO600K', 'PickaPicV2']", + ) + parser.add_argument( + "--training_from_scratch", + type=bool, + default=False, + required=False + ) + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--instance_data_dataset", + type=str, + default=None, + required=False, + help="A Hugging Face dataset containing the training images", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--instance_data_image", type=str, default=None, required=False, help="A single training image" + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument("--ema_decay", type=float, default=0.9999) + parser.add_argument("--ema_update_after_step", type=int, default=0) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument( + "--output_dir", + type=str, + default="muse_training", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. " + "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference." 
+ "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components." + "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step" + "instructions." + ), + ) + parser.add_argument( + "--logging_steps", + type=int, + default=50, + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more details" + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=0.0003, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run validation every X steps. Validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`" + " and logging the images." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="wandb", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' 
+ ), + ) + parser.add_argument("--validation_prompts", type=str, nargs="*") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument("--split_vae_encode", type=int, required=False, default=None) + parser.add_argument("--min_masking_rate", type=float, default=0.0) + parser.add_argument("--cond_dropout_prob", type=float, default=0.0) + parser.add_argument("--max_grad_norm", default=50.0, type=float, help="Max gradient norm.", required=False) + parser.add_argument("--use_lora", action="store_true", help="Fine tune the model using LoRa") + parser.add_argument("--text_encoder_use_lora", action="store_true", help="Fine tune the model using LoRa") + parser.add_argument("--lora_r", default=16, type=int) + parser.add_argument("--lora_alpha", default=32, type=int) + parser.add_argument("--lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+") + parser.add_argument("--text_encoder_lora_r", default=16, type=int) + parser.add_argument("--text_encoder_lora_alpha", default=32, type=int) + parser.add_argument("--text_encoder_lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+") + parser.add_argument("--train_text_encoder", action="store_true") + parser.add_argument("--image_key", type=str, required=False) + parser.add_argument("--prompt_key", type=str, required=False) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument("--prompt_prefix", type=str, required=False, default=None) + + args = parser.parse_args() + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + + num_datasources = sum( + [x is not None for x in [args.instance_data_dir, args.instance_data_image, args.instance_data_dataset]] + ) + + if num_datasources != 1: + raise ValueError( + "provide one and only one of `--instance_data_dir`, `--instance_data_image`, or `--instance_data_dataset`" + ) + + if args.instance_data_dir is not None: + if not os.path.exists(args.instance_data_dir): + raise ValueError(f"Does not exist: `--args.instance_data_dir` {args.instance_data_dir}") + + if args.instance_data_image is not None: + if not os.path.exists(args.instance_data_image): + raise ValueError(f"Does not exist: `--args.instance_data_image` {args.instance_data_image}") + + if args.instance_data_dataset is not None and (args.image_key is None or args.prompt_key is None): + raise ValueError("`--instance_data_dataset` requires setting `--image_key` and `--prompt_key`") + + return args + +def _prepare_latent_image_ids(batch_size, height, width, device, dtype): + latent_image_ids = torch.zeros(height // 2, width // 2, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + + latent_image_ids = latent_image_ids.reshape( + latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + # latent_image_ids = latent_image_ids.unsqueeze(0).repeat(batch_size, 1, 1) + + return latent_image_ids.to(device=device, dtype=dtype) + +def main(args): 
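+ # Overview of this entry point: it loads the CLIP text encoder and the VQ-VAE, optionally attaches + # LoRA adapters and an EMA copy, builds the optimizer, dataloader and LR scheduler, and then runs the + # masked-token cross-entropy training loop with periodic validation and checkpointing.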
+ if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + # if args.pretrained_model_architecture == "Meissonic": + # from src.pipeline import Pipeline + # else: + # raise ValueError(f"Unknown model architecture: {args.pretrained_model_architecture}") + + + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + ) + + if accelerator.is_main_process: + os.makedirs(args.output_dir, exist_ok=True) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + + if accelerator.is_main_process: + accelerator.init_trackers("meissonic", config=vars(copy.deepcopy(args))) + + if args.seed is not None: + set_seed(args.seed) + + if args.text_encoder_architecture == "open_clip": + if args.resume_from_checkpoint: + text_encoder = CLIPTextModelWithProjection.from_pretrained( + args.resume_from_checkpoint, subfolder="text_encoder", variant=args.variant + ) + tokenizer = CLIPTokenizer.from_pretrained( + args.resume_from_checkpoint, subfolder="tokenizer", variant=args.variant + ) + else: + text_encoder = CLIPTextModelWithProjection.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", variant=args.variant + ) + tokenizer = CLIPTokenizer.from_pretrained( + args.pretrained_model_name_or_path, subfolder="tokenizer", variant=args.variant + ) + + # elif args.text_encoder_architecture == "CLIP_T5_base": + # text_encoder_clip = CLIPTextModelWithProjection.from_pretrained( + # args.pretrained_model_name_or_path, subfolder="text_encoder", variant=args.variant + # ) + # tokenizer_clip = CLIPTokenizer.from_pretrained( + # args.pretrained_model_name_or_path, subfolder="tokenizer", variant=args.variant + # ) + # from transformers import T5Tokenizer, T5ForConditionalGeneration + # text_encoder_t5 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base",torch_dtype=torch.float16) + # tokenizer_t5 = T5Tokenizer.from_pretrained("google/flan-t5-base",torch_dtype=torch.float16) + # text_encoder = [text_encoder_clip,text_encoder_t5] + # tokenizer = [tokenizer_clip,tokenizer_t5] + # elif args.text_encoder_architecture == "flan-t5-base": + # from transformers import T5Tokenizer, T5ForConditionalGeneration + # text_encoder = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base",torch_dtype=torch.float16) + # tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base",torch_dtype=torch.float16) + # elif args.text_encoder_architecture == "gemini-2b": + # raise NotImplementedError("Gemini-2b is not yet supported") + # elif args.text_encoder_architecture == "Qwen2-0.5B": + # raise NotImplementedError("Qwen2-0.5B is not yet supported") + else: + raise ValueError(f"Unknown text encoder architecture: {args.text_encoder_architecture}") + + vq_model = VQModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="vqvae", revision=args.revision, variant=args.variant + ) + + if args.train_text_encoder: + if args.text_encoder_use_lora: + lora_config = LoraConfig( + r=args.text_encoder_lora_r, + lora_alpha=args.text_encoder_lora_alpha, + 
target_modules=args.text_encoder_lora_target_modules, + ) + text_encoder.add_adapter(lora_config) + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. Only support open_clip + text_encoder[0].train() + text_encoder[0].requires_grad_(True) + text_encoder[1].train() + text_encoder[1].requires_grad_(True) + else: + text_encoder.train() + text_encoder.requires_grad_(True) + else: + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. Only support open_clip + text_encoder[0].eval() + text_encoder[0].requires_grad_(False) + text_encoder[1].eval() + text_encoder[1].requires_grad_(False) + else: + text_encoder.eval() + text_encoder.requires_grad_(False) + + vq_model.requires_grad_(False) + + if args.pretrained_model_architecture == "Meissonic": + if args.training_from_scratch: + model = Transformer2DModel( + patch_size = 1, + in_channels = 64, + num_layers = 14, + num_single_layers = 28, + attention_head_dim = 128, + num_attention_heads = 8, + joint_attention_dim = 1024, + pooled_projection_dim = 1024, + guidance_embeds = False, + axes_dims_rope = (16, 56, 56), + downsample= True, + upsample= True, + ) + + # model_tmp = Transformer2DModel.from_pretrained("LAST_STAGE_CKPT_PATH", low_cpu_mem_usage=False, device_map=None) + # model.load_state_dict(model_tmp.state_dict(), strict=False) + # del model_tmp + else: + model = Transformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="transformer", low_cpu_mem_usage=False, device_map=None) + else: + raise ValueError(f"Unknown model architecture: {args.pretrained_model_architecture}") + + model = torch.compile(model) + + if args.use_lora: + lora_config = LoraConfig( + r=args.lora_r, + lora_alpha=args.lora_alpha, + target_modules=args.lora_target_modules, + ) + model.add_adapter(lora_config) + + model.train() + + if args.gradient_checkpointing: + model.enable_gradient_checkpointing() + if args.train_text_encoder: + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. 
Only support open_clip + text_encoder[0].gradient_checkpointing_enable() + text_encoder[1].gradient_checkpointing_enable() + else: + text_encoder.gradient_checkpointing_enable() + + if args.use_ema: # The robustness of this part has not been verified + ema = EMAModel( + model.parameters(), + decay=args.ema_decay, + update_after_step=args.ema_update_after_step, + model_cls=Transformer2DModel, + model_config=model.config, + ) + + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + transformer_lora_layers_to_save = None + text_encoder_lora_layers_to_save = None + + for model_ in models: + if isinstance(model_, type(accelerator.unwrap_model(model))): + if args.use_lora: + transformer_lora_layers_to_save = get_peft_model_state_dict(model_) + else: + model_.save_pretrained(os.path.join(output_dir, "transformer")) + elif isinstance(model_, type(accelerator.unwrap_model(text_encoder))): + if args.text_encoder_use_lora: + text_encoder_lora_layers_to_save = get_peft_model_state_dict(model_) + else: + model_.save_pretrained(os.path.join(output_dir, "text_encoder")) + else: + raise ValueError(f"unexpected save model: {model_.__class__}") + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + if transformer_lora_layers_to_save is not None or text_encoder_lora_layers_to_save is not None: + LoraLoaderMixin.save_lora_weights( + output_dir, + unet_lora_layers=transformer_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_lora_layers_to_save, + ) + + if args.use_ema: + ema.save_pretrained(os.path.join(output_dir, "ema_model")) + + def load_model_hook(models, input_dir): + transformer = None + text_encoder_ = None + + # this keeps the state_dict keys consistent when the model is wrapped with torch.compile() + def adap_compile(ori_dict): # add the '_orig_mod.' prefix to each key + new_dict = {} + for k, v in ori_dict.items(): + new_dict['_orig_mod.'+k] = v + return new_dict + + while len(models) > 0: + model_ = models.pop() + + if isinstance(model_, type(accelerator.unwrap_model(model))): + if args.use_lora: + transformer = model_ + else: + if args.pretrained_model_architecture == "Meissonic": + load_model = Transformer2DModel.from_pretrained(os.path.join(input_dir, "transformer"), low_cpu_mem_usage=False, device_map=None) + else: + raise ValueError(f"Unknown model architecture: {args.pretrained_model_architecture}") + model_.load_state_dict(adap_compile(load_model.state_dict())) + del load_model + elif isinstance(model_, type(accelerator.unwrap_model(text_encoder))): + if args.text_encoder_use_lora: + text_encoder_ = model_ + else: + try: + load_model = CLIPTextModelWithProjection.from_pretrained(os.path.join(input_dir, "text_encoder")) + model_.load_state_dict(load_model.state_dict()) + # print('finished loading text encoder!') + except Exception: + print('Text encoder not found in the checkpoint folder. 
So we download one text encoder from Internet.') + load_model = CLIPTextModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") + model_.load_state_dict(load_model.state_dict()) + del load_model + else: + raise ValueError(f"unexpected save model: {model.__class__}") + + if transformer is not None or text_encoder_ is not None: + lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + LoraLoaderMixin.load_lora_into_text_encoder( + lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_ + ) + LoraLoaderMixin.load_lora_into_transformer( + lora_state_dict, network_alphas=network_alphas, transformer=transformer + ) + + if args.use_ema: + load_from = EMAModel.from_pretrained(os.path.join(input_dir, "ema_model"), model_cls=Transformer2DModel) + ema.load_state_dict(adap_compile(load_from.state_dict())) + del load_from + + accelerator.register_load_state_pre_hook(load_model_hook) + accelerator.register_save_state_pre_hook(save_model_hook) + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + ) + + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`" + ) + + optimizer_cls = bnb.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + # no decay on bias and layernorm and embedding + no_decay = ["bias", "layer_norm.weight", "mlm_ln.weight", "embeddings.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.adam_weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + if args.train_text_encoder: + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. 
Only support open_clip + optimizer_grouped_parameters.append( + {"params": text_encoder[0].parameters(), "weight_decay": args.adam_weight_decay} + ) + optimizer_grouped_parameters.append( + {"params": text_encoder[1].parameters(), "weight_decay": args.adam_weight_decay} + ) + else: + optimizer_grouped_parameters.append( + {"params": text_encoder.parameters(), "weight_decay": args.adam_weight_decay} + ) + + optimizer = optimizer_cls( + optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + logger.info("Creating dataloaders and lr_scheduler") + + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + if args.instance_dataset == "MyParquetDataset": + dataset = MyParquetDataset( + root_dir=args.instance_data_dir, # something like '../parquets_father_dir/' + tokenizer=tokenizer, + size=args.resolution, + text_encoder_architecture=args.text_encoder_architecture + ) + elif args.instance_dataset == 'HuggingFaceDataset': # you can try this first, just download dataset from huggingface + dataset = HuggingFaceDataset( + hf_dataset=load_dataset(args.instance_data_dir, split="train"), # something like '../parquets_father_dir/' + tokenizer=tokenizer, + image_key='image', + prompt_key='caption', + prompt_prefix=args.prompt_prefix, + size=args.resolution, + text_encoder_architecture=args.text_encoder_architecture + ) + elif args.instance_dataset == "DATA_TYPE": + raise NotImplementedError("DATA_TYPE is not yet supported") + # Some instructions + # Origanize your text-image pairs in the following way: + # when apply __getitem__ method, return a dictionary with keys 'image', 'micro_conds' and 'prompt_input_ids' + # For more details to follow, please refer to the implementation of MyParquetDataset class + else: + assert False + + train_dataloader = DataLoader( + dataset, + batch_size=args.train_batch_size, + shuffle=True, + num_workers=args.dataloader_num_workers, + collate_fn=default_collate, + pin_memory=True, + ) + train_dataloader.num_batches = len(train_dataloader) + + lr_scheduler = diffusers.optimization.get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_training_steps=args.max_train_steps * accelerator.num_processes, + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + ) + + logger.info("Preparing model, optimizer and dataloaders") + + if args.train_text_encoder: + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. Only support open_clip + model, optimizer, lr_scheduler, train_dataloader, text_encoder[0], text_encoder[1] = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader, text_encoder[0], text_encoder[1] + ) + else: + model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader, text_encoder + ) + else: + model, optimizer, lr_scheduler, train_dataloader = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader + ) + + train_dataloader.num_batches = len(train_dataloader) + + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + if not args.train_text_encoder: + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. 
Only support open_clip + text_encoder[0].to(device=accelerator.device, dtype=weight_dtype) + text_encoder[1].to(device=accelerator.device, dtype=weight_dtype) + else: + text_encoder.to(device=accelerator.device, dtype=weight_dtype) + + vq_model.to(device=accelerator.device) + + if args.use_ema: + ema.to(accelerator.device) + + with nullcontext() if args.train_text_encoder else torch.no_grad(): + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. Only support open_clip + _input_ids_tmp_ = tokenize_prompt(tokenizer, "", args.text_encoder_architecture) + _input_ids_tmp_[0] = _input_ids_tmp_[0].to(accelerator.device, non_blocking=True) + _input_ids_tmp_[1] = _input_ids_tmp_[1].to(accelerator.device, non_blocking=True) + empty_embeds, empty_clip_embeds = encode_prompt( + text_encoder, _input_ids_tmp_, args.text_encoder_architecture + ) + else: + empty_embeds, empty_clip_embeds = encode_prompt( + text_encoder, tokenize_prompt(tokenizer, "", args.text_encoder_architecture).to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + + # There is a single image, we can just pre-encode the single prompt + if args.instance_data_image is not None: + prompt = os.path.splitext(os.path.basename(args.instance_data_image))[0] + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. Only support open_clip + _input_ids_tmp_ = tokenize_prompt(tokenizer, prompt, args.text_encoder_architecture) + _input_ids_tmp_[0] = _input_ids_tmp_[0].to(accelerator.device, non_blocking=True) + _input_ids_tmp_[1] = _input_ids_tmp_[1].to(accelerator.device, non_blocking=True) + empty_embeds, empty_clip_embeds = encode_prompt( + text_encoder, _input_ids_tmp_, args.text_encoder_architecture + ) + else: + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, tokenize_prompt(tokenizer, prompt, args.text_encoder_architecture).to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + encoder_hidden_states = encoder_hidden_states.repeat(args.train_batch_size, 1, 1) + cond_embeds = cond_embeds.repeat(args.train_batch_size, 1) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps) + # Afterwards we recalculate our number of training epochs. + # Note: We are not doing epoch based training here, but just using this for book keeping and being able to + # reuse the same training loop with other datasets/loaders. + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Train! + logger.info("***** Running training *****") + logger.info(f" Num training steps = {args.max_train_steps}") + logger.info(f" Instantaneous batch size per device = { args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + + resume_from_checkpoint = args.resume_from_checkpoint + if resume_from_checkpoint: + if resume_from_checkpoint == "latest": + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + if len(dirs) > 0: + resume_from_checkpoint = os.path.join(args.output_dir, dirs[-1]) + else: + resume_from_checkpoint = None + + if resume_from_checkpoint is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + else: + accelerator.print(f"Resuming from checkpoint {resume_from_checkpoint}") + + if resume_from_checkpoint is None: + global_step = 0 + first_epoch = 0 + else: + accelerator.load_state(resume_from_checkpoint) + global_step = int(os.path.basename(resume_from_checkpoint).split("-")[1]) + first_epoch = global_step // num_update_steps_per_epoch + + # This is to solve the inconsistent tensor device issue + if args.use_ema: + ema.shadow_params = [p.to(accelerator.device) for p in ema.shadow_params] + + # As stated above, we are not doing epoch based training here, but just using this for book keeping and being able to + # reuse the same training loop with other datasets/loaders. + for epoch in range(first_epoch, num_train_epochs): + for batch in train_dataloader: + torch.cuda.empty_cache() + with torch.no_grad(): + micro_conds = batch["micro_conds"].to(accelerator.device, non_blocking=True) + pixel_values = batch["image"].to(accelerator.device, non_blocking=True) + + batch_size = pixel_values.shape[0] + + split_batch_size = args.split_vae_encode if args.split_vae_encode is not None else batch_size + num_splits = math.ceil(batch_size / split_batch_size) + image_tokens = [] + for i in range(num_splits): + start_idx = i * split_batch_size + end_idx = min((i + 1) * split_batch_size, batch_size) + # Reshape with the actual split size so the last (possibly smaller) split is handled correctly + image_tokens.append( + vq_model.quantize(vq_model.encode(pixel_values[start_idx:end_idx]).latents)[2][2].reshape( + end_idx - start_idx, -1 + ) + ) + image_tokens = torch.cat(image_tokens, dim=0) + + batch_size, seq_len = image_tokens.shape + + timesteps = torch.rand(batch_size, device=image_tokens.device) + mask_prob = torch.cos(timesteps * math.pi * 0.5) + mask_prob = mask_prob.clip(args.min_masking_rate) + + num_token_masked = (seq_len * mask_prob).round().clamp(min=1) + batch_randperm = torch.rand(batch_size, seq_len, device=image_tokens.device).argsort(dim=-1) + mask = batch_randperm < num_token_masked.unsqueeze(-1) + + mask_id = accelerator.unwrap_model(model).config.vocab_size - 1 + input_ids = torch.where(mask, mask_id, image_tokens) + labels = torch.where(mask, image_tokens, -100) + + if "prompt_input_ids" in batch: + with nullcontext() if args.train_text_encoder else torch.no_grad(): + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. 
Only support open_clip + batch["prompt_input_ids"][0] = batch["prompt_input_ids"][0].to(accelerator.device, non_blocking=True) + batch["prompt_input_ids"][1] = batch["prompt_input_ids"][1].to(accelerator.device, non_blocking=True) + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, batch["prompt_input_ids"], args.text_encoder_architecture + ) + else: + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True), args.text_encoder_architecture + ) + + if args.cond_dropout_prob > 0.0: + assert encoder_hidden_states is not None + + batch_size = encoder_hidden_states.shape[0] + + mask = ( + torch.zeros((batch_size, 1, 1), device=encoder_hidden_states.device).float().uniform_(0, 1) + < args.cond_dropout_prob + ) + + empty_embeds_ = empty_embeds.expand(batch_size, -1, -1) + encoder_hidden_states = torch.where( + (encoder_hidden_states * mask).bool(), encoder_hidden_states, empty_embeds_ + ) + + empty_clip_embeds_ = empty_clip_embeds.expand(batch_size, -1) + cond_embeds = torch.where((cond_embeds * mask.squeeze(-1)).bool(), cond_embeds, empty_clip_embeds_) + + bs = input_ids.shape[0] + vae_scale_factor = 2 ** (len(vq_model.config.block_out_channels) - 1) + resolution = args.resolution // vae_scale_factor + input_ids = input_ids.reshape(bs, resolution, resolution) + + if "prompt_input_ids" in batch: + with nullcontext() if args.train_text_encoder else torch.no_grad(): + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. Only support open_clip + batch["prompt_input_ids"][0] = batch["prompt_input_ids"][0].to(accelerator.device, non_blocking=True) + batch["prompt_input_ids"][1] = batch["prompt_input_ids"][1].to(accelerator.device, non_blocking=True) + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, batch["prompt_input_ids"],args.text_encoder_architecture + ) + else: + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True),args.text_encoder_architecture + ) + + # Train Step + with accelerator.accumulate(model): + codebook_size = accelerator.unwrap_model(model).config.codebook_size + + if args.pretrained_model_architecture == 'Meissonic': + + if args.resolution == 1024: # only stage 3 and stage 4 do not apply 2* + img_ids = _prepare_latent_image_ids(input_ids.shape[0], input_ids.shape[-2],input_ids.shape[-1],input_ids.device,input_ids.dtype) + else: + img_ids = _prepare_latent_image_ids(input_ids.shape[0],2*input_ids.shape[-2],2*input_ids.shape[-1],input_ids.device,input_ids.dtype) + + txt_ids = torch.zeros(encoder_hidden_states.shape[1],3).to(device = input_ids.device, dtype = input_ids.dtype) + + logits = ( + model( + hidden_states=input_ids, # should be (batch size, channel, height, width) + encoder_hidden_states=encoder_hidden_states, # should be (batch size, sequence_len, embed_dims) + micro_conds=micro_conds, # + pooled_projections=cond_embeds, # should be (batch_size, projection_dim) + img_ids = img_ids, + txt_ids = txt_ids, + # timestep = timesteps * 20, + timestep = mask_prob * 1000, + # guidance = 9, + ) + .reshape(bs, codebook_size, -1) + .permute(0, 2, 1) + .reshape(-1, codebook_size) + ) + else: + raise ValueError(f"Unknown model architecture: {args.pretrained_model_architecture}") + + loss = F.cross_entropy( + logits, + labels.view(-1), + ignore_index=-100, + reduction="mean", + ) + + # Gather the losses across all processes for logging (if we use distributed training). 
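+ # Note: `loss` is a scalar here (reduction="mean"); repeating it `train_batch_size` times before + # `accelerator.gather` yields one entry per local sample, so the final `.mean()` approximates the + # average step loss across all processes. The gathered value is only used for logging; the backward + # pass below uses the un-gathered `loss`.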
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + avg_masking_rate = accelerator.gather(mask_prob.repeat(args.train_batch_size)).mean() + + accelerator.backward(loss) + + if args.max_grad_norm is not None and accelerator.sync_gradients: + accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + + optimizer.zero_grad(set_to_none=True) + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + if args.use_ema: + ema.step(model.parameters()) + + if (global_step + 1) % args.logging_steps == 0: + logs = { + "step_loss": avg_loss.item(), + "lr": lr_scheduler.get_last_lr()[0], + "avg_masking_rate": avg_masking_rate.item(), + } + accelerator.log(logs, step=global_step + 1) + + logger.info( + f"Step: {global_step + 1} " + f"Loss: {avg_loss.item():0.4f} " + f"LR: {lr_scheduler.get_last_lr()[0]:0.6f}" + ) + + if (global_step + 1) % args.checkpointing_steps == 0: + save_checkpoint(args, accelerator, global_step + 1, logger) + + if (global_step + 1) % args.validation_steps == 0 and accelerator.is_main_process: + if args.use_ema: + ema.store(model.parameters()) + ema.copy_to(model.parameters()) + + with torch.no_grad(): + logger.info("Generating images...") + + model.eval() + + if args.train_text_encoder: + text_encoder.eval() + + scheduler = Scheduler.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="scheduler", + revision=args.revision, + variant=args.variant, + ) + if args.text_encoder_architecture == "CLIP" or args.text_encoder_architecture == "open_clip": + pipe = Pipeline( + transformer=accelerator.unwrap_model(model), + tokenizer=tokenizer, + text_encoder=text_encoder, + vqvae=vq_model, + scheduler=scheduler, + ) + else: + pipe = Pipeline( + transformer=accelerator.unwrap_model(model), + tokenizer=tokenizer[0], + text_encoder=text_encoder[0], + vqvae=vq_model, + scheduler=scheduler, + text_encoder_t5=text_encoder[1], + tokenizer_t5=tokenizer[1] + ) + + + + + + pil_images = pipe(prompt=args.validation_prompts,height=args.resolution,width=args.resolution,guidance_scale=9,num_inference_steps=64).images + wandb_images = [ + wandb.Image(image, caption=args.validation_prompts[i]) + for i, image in enumerate(pil_images) + ] + + wandb.log({"generated_images": wandb_images}, step=global_step + 1) + + result=[] + for img in pil_images: + if not isinstance(img, torch.Tensor): + img = transforms.ToTensor()(img) + result.append(img.unsqueeze(0)) + result = torch.cat(result,dim=0) + result = make_grid(result, nrow=3) + save_image(result,os.path.join(args.output_dir,str(global_step)+'_text2image_1024_CFG-9.png')) + + + # pil_images = pipe(prompt=args.validation_prompts,height=args.resolution,width=args.resolution,guidance_scale=9).images + # result=[] + # for img in pil_images: + # if not isinstance(img, torch.Tensor): + # img = transforms.ToTensor()(img) + # result.append(img.unsqueeze(0)) + # result = torch.cat(result,dim=0) + # result = make_grid(result, nrow=3) + # save_image(result,os.path.join(args.output_dir,str(global_step)+'_text2image_1024_CFG-9.png')) + + + + model.train() + + if args.train_text_encoder: + if args.text_encoder_architecture == "CLIP_T5_base": # Not support yet. 
Only support open_clip + text_encoder[0].train() + text_encoder[1].train() + else: + text_encoder.train() + + if args.use_ema: + ema.restore(model.parameters()) + + global_step += 1 + + # Stop training if max steps is reached + if global_step >= args.max_train_steps: + break + # End for + + accelerator.wait_for_everyone() + + # Evaluate and save checkpoint at the end of training + save_checkpoint(args, accelerator, global_step, logger) + + # Save the final trained checkpoint + if accelerator.is_main_process: + model = accelerator.unwrap_model(model) + if args.use_ema: + ema.copy_to(model.parameters()) + model.save_pretrained(args.output_dir) + + accelerator.end_training() + + + + + +if __name__ == "__main__": + main(parse_args()) + + diff --git a/Meissonic/train/train_overfit.py b/Meissonic/train/train_overfit.py new file mode 100644 index 0000000000000000000000000000000000000000..386a82b4cefcf1e5c1742d44cba6efd3e5b444b9 --- /dev/null +++ b/Meissonic/train/train_overfit.py @@ -0,0 +1,624 @@ +#!/usr/bin/env python3 +""" +Overfitting experiment script to verify implementation correctness. + +This script trains on a tiny subset (128-256 videos) with: +- High learning rate (5e-4 to 1e-3) +- Simple constant/warmup scheduler +- Small batch size (4-8) +- Fixed seed for reproducibility +- 2k-5k steps + +Expected behavior if implementation is correct: +- Loss should drop to 5-6 or even lower (0.x) +- Loss should continue decreasing, indicating ability to overfit + +If loss stays high (9-10) or diverges, there's likely a bug in: +- mask_token logic +- scheduler +- label alignment +- logits reshaping +""" + +import argparse +import logging +import os +import sys +sys.path.append(os.getcwd()) + +import torch +import torch.nn.functional as F +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from torch.utils.data import DataLoader, default_collate +from transformers import T5Tokenizer, T5EncoderModel + +from src.scheduler_video import Scheduler +from src.pipeline_video import CosmosVideoTokenizer, Pipeline as VideoPipeline +from src.transformer_video import WanDiscreteVideoTransformer, WanModel +from train.dataset_utils import TinyOpenVid1MDataset, tokenize_prompt, encode_prompt +from torchvision import transforms +from torchvision.utils import save_image, make_grid + +logger = get_logger(__name__, log_level="INFO") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Overfitting experiment for video diffusion model") + + # Model args + parser.add_argument("--text_encoder_architecture", type=str, default="umt5-base") + parser.add_argument("--video_tokenizer_model_id", type=str, default="Cosmos-1.0-Tokenizer-DV8x16x16") + parser.add_argument("--wan_pretrained_path", type=str, default=None, help="Path to pretrained Wan weights") + + # Dataset args + parser.add_argument("--instance_data_dir", type=str, required=True, help="Path to OpenVid1M CSV file") + parser.add_argument("--max_samples", type=int, default=256, help="Number of samples for overfitting (128-256)") + parser.add_argument("--num_frames", type=int, default=16) + parser.add_argument("--video_height", type=int, default=480) + parser.add_argument("--video_width", type=int, default=848) + + # Training args + parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size (4-8 for overfitting)") + parser.add_argument("--learning_rate", type=float, default=5e-4, help="High LR for overfitting (5e-4 to 1e-3)") + 
parser.add_argument("--max_train_steps", type=int, default=3000, help="Steps for overfitting (2k-5k)") + parser.add_argument("--gradient_accumulation_steps", type=int, default=1) + parser.add_argument("--lr_warmup_steps", type=int, default=100, help="Small warmup for overfitting") + parser.add_argument("--gradient_checkpointing", action="store_true") + parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["no", "fp16", "bf16"]) + + # Other args + parser.add_argument("--seed", type=int, default=42, help="Fixed seed for reproducibility") + parser.add_argument("--output_dir", type=str, default="./output_overfit") + parser.add_argument("--logging_steps", type=int, default=50) + parser.add_argument("--save_steps", type=int, default=500) + parser.add_argument("--inference_steps", type=int, default=500, help="Steps interval for inference (default: 500)") + parser.add_argument("--num_inference_samples", type=int, default=4, help="Number of prompts to use for inference") + parser.add_argument("--num_inference_steps", type=int, default=48, help="Number of inference steps for generation") + parser.add_argument("--dataloader_num_workers", type=int, default=4) + + return parser.parse_args() + + +def save_video_frames(video, output_path, prompt_text=""): + """ + Save video frames as a grid image and individual frames. + + Args: + video: Can be list of PIL Images or torch.Tensor [C, F, H, W] + output_path: Base path for saving (without extension) + prompt_text: Prompt text for filename + """ + import numpy as np + from PIL import Image + + # Convert to list of PIL Images if needed + if isinstance(video, torch.Tensor): + # video: [C, F, H, W] in [0, 1] + C, F, H, W = video.shape + frames = [] + for f in range(F): + frame = video[:, f, :, :].cpu().numpy() # [C, H, W] + frame = np.transpose(frame, (1, 2, 0)) # [H, W, C] + frame = (frame * 255).astype(np.uint8) + frames.append(Image.fromarray(frame)) + video = frames + elif isinstance(video, list): + frames = video + else: + logger.warning(f"Unknown video type: {type(video)}") + return + + if not frames: + logger.warning(f"No frames to save for {output_path}") + return + + # Save grid of all frames + frames_tensor = torch.stack([transforms.ToTensor()(frame) for frame in frames], dim=0) + grid = make_grid(frames_tensor, nrow=min(4, len(frames))) + grid_path = f"{output_path}_grid.png" + save_image(grid, grid_path) + logger.info(f"Saved video grid to {grid_path}") + + # Save individual frames + frames_dir = f"{output_path}_frames" + os.makedirs(frames_dir, exist_ok=True) + for i, frame in enumerate(frames): + frame_path = os.path.join(frames_dir, f"frame_{i:03d}.png") + frame.save(frame_path) + + # Save as GIF + try: + gif_path = f"{output_path}.gif" + frames[0].save( + gif_path, + save_all=True, + append_images=frames[1:], + duration=200, # 200ms per frame + loop=0 + ) + logger.info(f"Saved video GIF to {gif_path}") + except Exception as e: + logger.warning(f"Failed to save GIF: {e}") + + +def main(): + args = parse_args() + + # Set seed for reproducibility + set_seed(args.seed) + + # Initialize accelerator + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=None, # Disable wandb for overfitting experiment + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + + if 
accelerator.is_local_main_process: + os.makedirs(args.output_dir, exist_ok=True) + + # Initialize tokenizer and text encoder + logger.info("Initializing text encoder...") + if args.text_encoder_architecture in ["umt5-base", "t5"]: + tokenizer = T5Tokenizer.from_pretrained("google/umt5-base") + text_encoder = T5EncoderModel.from_pretrained("google/umt5-base") + else: + raise ValueError(f"Unsupported text encoder: {args.text_encoder_architecture}") + + text_encoder.requires_grad_(False) + text_encoder.eval() + text_dim_actual = text_encoder.config.d_model + + # Initialize video tokenizer + logger.info("Initializing video tokenizer...") + device = accelerator.device + dtype = torch.bfloat16 if args.mixed_precision == "bf16" else (torch.float16 if args.mixed_precision == "fp16" else torch.float32) + video_tokenizer = CosmosVideoTokenizer( + model_id=args.video_tokenizer_model_id, + device=device, + dtype=dtype, + ) + video_tokenizer.requires_grad_(False) + + # Calculate compressed dimensions + t_ds = video_tokenizer.t_downsample + h_ds = video_tokenizer.h_downsample + w_ds = video_tokenizer.w_downsample + F_prime = args.num_frames // t_ds + H_prime = args.video_height // h_ds + W_prime = args.video_width // w_ds + + # Initialize transformer model + logger.info("Initializing transformer model...") + + # Try to load Wan config from pretrained weights if provided + wan_config = None + if args.wan_pretrained_path: + try: + if os.path.isdir(args.wan_pretrained_path): + # Local directory + config_path = os.path.join(args.wan_pretrained_path, "config.json") + if os.path.exists(config_path): + import json + with open(config_path, 'r') as f: + wan_config = json.load(f) + else: + # HuggingFace Hub - try to load config + try: + from types import SimpleNamespace + temp_model = WanModel.from_pretrained(args.wan_pretrained_path, subfolder=None) + wan_config = SimpleNamespace( + dim=temp_model.dim, + ffn_dim=temp_model.ffn_dim, + num_layers=temp_model.num_layers, + num_heads=temp_model.num_heads, + freq_dim=temp_model.freq_dim, + in_dim=temp_model.in_dim, + out_dim=temp_model.out_dim, + text_dim=getattr(temp_model, 'text_dim', None), + ) + del temp_model + except: + pass + except Exception as e: + logger.warning(f"Failed to load Wan config: {e}") + + # Use Wan config if available, otherwise use defaults + if wan_config: + dim = wan_config.dim + ffn_dim = wan_config.ffn_dim + num_layers = wan_config.num_layers + num_heads = wan_config.num_heads + freq_dim = wan_config.freq_dim + in_dim = wan_config.in_dim + out_dim = wan_config.out_dim + text_dim_for_model = wan_config.text_dim if wan_config.text_dim else text_dim_actual + else: + # Default values + dim = 2048 + ffn_dim = 8192 + num_layers = 32 + num_heads = 16 + freq_dim = 256 + in_dim = 16 + out_dim = 16 + text_dim_for_model = text_dim_actual + + # Override text_dim with actual text encoder dimension + if text_dim_for_model != text_dim_actual: + logger.warning(f"Wan config text_dim ({text_dim_for_model}) != text encoder dim ({text_dim_actual}), using {text_dim_actual}") + text_dim_for_model = text_dim_actual + + model = WanDiscreteVideoTransformer( + codebook_size=video_tokenizer.codebook_size, + vocab_size=video_tokenizer.codebook_size + 1, + num_frames=F_prime, + height=H_prime, + width=W_prime, + text_dim=text_dim_for_model, + dim=dim, + ffn_dim=ffn_dim, + num_layers=num_layers, + num_heads=num_heads, + freq_dim=freq_dim, + in_dim=in_dim, + out_dim=out_dim, + ) + + # Load pretrained weights if provided + if args.wan_pretrained_path: + 
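+ # This branch tries, in order: a local directory containing diffusion_pytorch_model.safetensors + # (falling back to pytorch_model.bin), and otherwise the HuggingFace Hub via WanModel.from_pretrained. + # Any text_embedding.* weights whose input dimension does not match the text encoder are dropped, and + # the remaining weights are loaded into model.backbone with strict=False.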
logger.info(f"Loading pretrained weights from {args.wan_pretrained_path}...") + try: + if os.path.isdir(args.wan_pretrained_path): + state_dict_path = os.path.join(args.wan_pretrained_path, "diffusion_pytorch_model.safetensors") + if not os.path.exists(state_dict_path): + state_dict_path = os.path.join(args.wan_pretrained_path, "pytorch_model.bin") + + if os.path.exists(state_dict_path): + from safetensors import safe_open + wan_state_dict = {} + if state_dict_path.endswith('.safetensors'): + with safe_open(state_dict_path, framework="pt", device="cpu") as f: + for k in f.keys(): + wan_state_dict[k] = f.get_tensor(k) + else: + wan_state_dict = torch.load(state_dict_path, map_location="cpu") + else: + raise FileNotFoundError(f"State dict not found in {args.wan_pretrained_path}") + else: + # HuggingFace Hub + temp_model = WanModel.from_pretrained(args.wan_pretrained_path, subfolder=None) + wan_state_dict = temp_model.state_dict() + del temp_model + + # Remove text_embedding weights if shape doesn't match + text_embedding_key = 'text_embedding.0.weight' + if text_embedding_key in wan_state_dict: + pretrained_text_dim = wan_state_dict[text_embedding_key].shape[1] + model_text_dim = model.backbone.text_embedding[0].weight.shape[1] + + if pretrained_text_dim != model_text_dim: + keys_to_remove = [k for k in wan_state_dict.keys() if 'text_embedding' in k] + for k in keys_to_remove: + del wan_state_dict[k] + logger.info(f"Removed {len(keys_to_remove)} text_embedding keys due to dimension mismatch") + + missing_keys, unexpected_keys = model.backbone.load_state_dict(wan_state_dict, strict=False) + if missing_keys: + logger.warning(f"Missing keys: {missing_keys[:10]}...") + if unexpected_keys: + logger.warning(f"Unexpected keys: {unexpected_keys[:10]}...") + logger.info("Successfully loaded pretrained weights") + except Exception as e: + logger.warning(f"Failed to load pretrained weights: {e}") + + # Initialize scheduler + logger.info("Initializing scheduler...") + scheduler = Scheduler( + mask_token_id=video_tokenizer.mask_token_id, + masking_schedule="cosine", + ) + + # Setup optimizer + logger.info("Setting up optimizer...") + optimizer = torch.optim.AdamW( + model.parameters(), + lr=args.learning_rate, + betas=(0.9, 0.999), + weight_decay=0.01, + eps=1e-8, + ) + + # Simple constant scheduler with warmup + def lr_lambda(current_step): + if current_step < args.lr_warmup_steps: + return float(current_step) / float(max(1, args.lr_warmup_steps)) + return 1.0 + + lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + + # Create tiny dataset + logger.info(f"Creating tiny dataset with {args.max_samples} samples...") + + # Auto-detect video_root_dir if not provided + csv_path = args.instance_data_dir + csv_dir = os.path.dirname(csv_path) + if os.path.exists(os.path.join(csv_dir, 'video_reorg')): + video_root_dir = os.path.join(csv_dir, 'video_reorg') + elif os.path.exists(os.path.join(os.path.dirname(csv_dir), 'video_reorg')): + video_root_dir = os.path.join(os.path.dirname(csv_dir), 'video_reorg') + else: + # Fallback: use CSV directory + video_root_dir = csv_dir + logger.warning(f"Video directory not found, using CSV directory: {video_root_dir}") + + dataset = TinyOpenVid1MDataset( + csv_path=csv_path, + video_root_dir=video_root_dir, + tokenizer=tokenizer, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + text_encoder_architecture=args.text_encoder_architecture, + max_samples=args.max_samples, + seed=args.seed, + ) + + # Create dataloader + 
train_dataloader = DataLoader( + dataset, + batch_size=args.train_batch_size, + shuffle=True, + num_workers=args.dataloader_num_workers, + collate_fn=default_collate, + pin_memory=True, + prefetch_factor=2 if args.dataloader_num_workers > 0 else None, + persistent_workers=args.dataloader_num_workers > 0, + ) + + logger.info(f"Dataset size: {len(dataset)}") + logger.info(f"Dataloader batches: {len(train_dataloader)}") + + # Enable gradient checkpointing if requested (before prepare) + if args.gradient_checkpointing: + model.enable_gradient_checkpointing() + + # Prepare with accelerator + model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare( + model, optimizer, lr_scheduler, train_dataloader, text_encoder + ) + + # Training loop + logger.info("Starting overfitting experiment...") + logger.info(f"Target: Loss should drop to 5-6 or lower within {args.max_train_steps} steps") + logger.info(f"If loss stays high (9-10) or diverges, there's likely a bug") + + model.train() + global_step = 0 + + for epoch in range(1000): # Large number, will break on step limit + for batch in train_dataloader: + with accelerator.accumulate(model): + # Get video and encode + video_values = batch["video"].to(device=accelerator.device, dtype=torch.float32) + video_tokens = video_tokenizer.encode(video_values) # [B, F', H', W'] + + # Flatten for masking + B, F_prime_vid, H_prime_vid, W_prime_vid = video_tokens.shape + video_tokens_flat = video_tokens.view(B, -1) + seq_len = video_tokens_flat.shape[1] + + # Apply masking: use per-sample masking like train_mei_video.py + mask_prob = torch.rand(B, device=video_tokens_flat.device) * 0.5 + 0.1 # 0.1 to 0.6 per sample + num_token_masked = (seq_len * mask_prob).round().clamp(min=1) + batch_randperm = torch.rand(B, seq_len, device=video_tokens_flat.device).argsort(dim=-1) + mask = batch_randperm < num_token_masked.unsqueeze(-1) + + # Create input_ids: masked positions get mask_token_id, others keep original tokens + mask_id = video_tokenizer.mask_token_id # codebook_size + input_ids_flat = torch.where(mask, mask_id, video_tokens_flat) + # Create labels: masked positions get original tokens (for loss), others get -100 (ignored) + labels_flat = torch.where(mask, video_tokens_flat, -100) + + # Reshape back to [B, F', H', W'] for model forward + input_ids = input_ids_flat.view(B, F_prime_vid, H_prime_vid, W_prime_vid) + labels = labels_flat.view(B, F_prime_vid, H_prime_vid, W_prime_vid) + + # Use average mask ratio for timestep + mask_ratio = mask_prob.mean().item() + + # Encode text + encoder_hidden_states, cond_embeds = encode_prompt( + text_encoder, + batch["prompt_input_ids"].to(device=accelerator.device), + args.text_encoder_architecture + ) + + # Forward pass + logits = model( + tokens=input_ids, + timesteps=torch.full((B,), int(mask_ratio * 1000), device=accelerator.device, dtype=torch.long), + encoder_hidden_states=encoder_hidden_states, + y=None, + ) + + # Reshape for loss + # logits: [B, vocab_size, F', H', W'] -> [B*F'*H'*W', vocab_size] + B_logits, vocab_size, F_prime_logits, H_prime_logits, W_prime_logits = logits.shape + logits = logits.permute(0, 2, 3, 4, 1).reshape(B_logits * F_prime_logits * H_prime_logits * W_prime_logits, vocab_size) + + # labels: [B, F', H', W'] - crop to match logits dimensions if needed + B_labels, F_prime_labels, H_prime_labels, W_prime_labels = labels.shape + assert B_logits == B_labels, f"Batch size mismatch: logits {B_logits} vs labels {B_labels}" + + # Crop labels to match logits spatial dimensions + if 
F_prime_labels != F_prime_logits or H_prime_labels != H_prime_logits or W_prime_labels != W_prime_logits: + labels = labels[:, :F_prime_logits, :H_prime_logits, :W_prime_logits] + + # labels: [B, F', H', W'] -> [B*F'*H'*W'] + labels_flat = labels.reshape(-1) + + # Verify label values are in valid range [0, codebook_size-1] or -100 (ignored) + codebook_size = video_tokenizer.codebook_size + valid_labels = labels_flat[(labels_flat >= 0) & (labels_flat != -100)] + if len(valid_labels) > 0: + assert valid_labels.min() >= 0 and valid_labels.max() < codebook_size, ( + f"Label values out of range: min={valid_labels.min()}, max={valid_labels.max()}, " + f"expected [0, {codebook_size-1}]" + ) + + # Compute loss: only on masked positions (labels != -100), ignore unmasked positions + # vocab_size = codebook_size + 1 (includes mask_token_id = codebook_size) + # labels are in [0, codebook_size-1] range (Cosmos tokens), which map directly to logits indices [0, codebook_size-1] + loss = F.cross_entropy( + logits, + labels_flat, + ignore_index=-100, # Ignore unmasked positions + reduction="mean", + ) + + # Backward + accelerator.backward(loss) + if accelerator.sync_gradients: + accelerator.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + if accelerator.sync_gradients: + global_step += 1 + + if global_step % args.logging_steps == 0: + logger.info(f"Step {global_step}/{args.max_train_steps}, Loss: {loss.item():.4f}, LR: {lr_scheduler.get_last_lr()[0]:.2e}") + + if global_step % args.save_steps == 0: + if accelerator.is_main_process: + checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + os.makedirs(checkpoint_path, exist_ok=True) + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained(checkpoint_path) + logger.info(f"Saved checkpoint to {checkpoint_path}") + + # Inference: generate videos using training data prompts + if global_step % args.inference_steps == 0 and global_step > 0: + if accelerator.is_main_process: + logger.info(f"Step {global_step}: Generating videos for inference...") + + # Sample prompts from training dataset (get original captions from dataset.data) + inference_indices = torch.randperm(len(dataset), generator=torch.Generator().manual_seed(args.seed))[:args.num_inference_samples].tolist() + inference_prompts = [] + for idx in inference_indices: + # Get original caption from dataset + row = dataset.data[idx] + prompt_text = row['caption'] + if dataset.prompt_prefix is not None: + prompt_text = dataset.prompt_prefix + prompt_text + inference_prompts.append(prompt_text) + + logger.info(f"Using prompts: {inference_prompts[:2]}...") # Log first 2 prompts + + try: + # Create inference pipeline + model.eval() + text_encoder.eval() + + # Get unwrapped model and ensure correct dtype + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_text_encoder = accelerator.unwrap_model(text_encoder) + weight_dtype = torch.bfloat16 if args.mixed_precision == "bf16" else (torch.float16 if args.mixed_precision == "fp16" else torch.float32) + unwrapped_model = unwrapped_model.to(dtype=weight_dtype) + unwrapped_text_encoder = unwrapped_text_encoder.to(dtype=weight_dtype) + + # Create scheduler for inference + inference_scheduler = Scheduler( + mask_token_id=video_tokenizer.mask_token_id, + masking_schedule="cosine" + ) + inference_scheduler.set_timesteps( + num_inference_steps=args.num_inference_steps, + device=accelerator.device + ) + + # Create pipeline + pipe = VideoPipeline( + 
tokenizer=tokenizer, + text_encoder=unwrapped_text_encoder, + transformer=unwrapped_model, + scheduler=inference_scheduler, + video_tokenizer=video_tokenizer, + text_len=512, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + ) + pipe = pipe.to(accelerator.device) + + # Generate videos + with torch.no_grad(): + videos = pipe( + prompt=inference_prompts, + num_frames=args.num_frames, + height=args.video_height, + width=args.video_width, + guidance_scale=9.0, + num_inference_steps=args.num_inference_steps, + output_type="pil", + ).videos + + # Save videos + inference_dir = os.path.join(args.output_dir, f"inference_step_{global_step}") + os.makedirs(inference_dir, exist_ok=True) + + for i, (video, prompt) in enumerate(zip(videos, inference_prompts)): + # Sanitize prompt for filename (remove special chars, limit length) + safe_prompt = "".join(c for c in prompt[:50] if c.isalnum() or c in (' ', '-', '_')).strip().replace(' ', '_')[:30] + if not safe_prompt: + safe_prompt = f"prompt_{i}" + output_path = os.path.join(inference_dir, f"step{global_step}_video{i}_{safe_prompt}") + save_video_frames(video, output_path, prompt_text=prompt) + + logger.info(f"Saved inference videos to {inference_dir}") + logger.info(f"=" * 80) + logger.info(f"Step {global_step} Inference Results:") + logger.info(f" - Videos saved to: {inference_dir}") + logger.info(f" - Prompts used: {inference_prompts}") + logger.info(f" - Check videos to observe:") + logger.info(f" * Structure: When do coherent shapes/objects appear?") + logger.info(f" * Motion: When does temporal consistency emerge?") + logger.info(f" * Condition alignment: When do videos match prompts?") + logger.info(f"=" * 80) + + # Set model back to training mode + model.train() + + except Exception as e: + logger.error(f"Inference failed at step {global_step}: {e}") + import traceback + traceback.print_exc() + model.train() # Ensure model is back in training mode + + if global_step >= args.max_train_steps: + break + + if global_step >= args.max_train_steps: + break + + logger.info("Overfitting experiment completed!") + logger.info(f"Final loss: {loss.item():.4f}") + logger.info("If loss dropped to 5-6 or lower, implementation is likely correct.") + logger.info("If loss stayed high (9-10) or diverged, check for bugs in mask_token, scheduler, or label alignment.") + + +if __name__ == "__main__": + main() + diff --git a/Meissonic/train/train_video.sh b/Meissonic/train/train_video.sh new file mode 100644 index 0000000000000000000000000000000000000000..dc650c62ad3b1d153794a597c4e1d0a768d116b2 --- /dev/null +++ b/Meissonic/train/train_video.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# 8-GPU training script for video diffusion model +# Usage: bash train/train_video.sh + +accelerate launch --multi_gpu --gpu_ids '0,1,2,3,4,5,6,7' --main_process_port 25011 --num_processes 8 \ + train/train_mei_video.py \ + --use_precomputed_features \ + --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features \ + --text_encoder_architecture umt5-xxl \ + --wan_pretrained_path Wan-AI/Wan2.1-T2V-1.3B \ + --training_from_scratch True \ + --pretrained_model_name_or_path "dummy" \ + --wan_backbone_lr_ratio 0.2 \ + --num_frames 17 \ + --video_height 128 \ + --video_width 128 \ + --dataloader_num_workers 8 \ + --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \ + --instance_dataset OpenVid1MDataset \ + --instance_data_dir "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ + --train_batch_size 8 \ + --gradient_accumulation_steps 4 
\ + --learning_rate 3e-4 \ + --max_train_steps 10000 \ + --checkpointing_steps 500 \ + --validation_steps 500 \ + --logging_steps 10 \ + --validation_prompts "a cat playing" "a girl walking" \ + --output_dir "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio" \ + --mixed_precision bf16 \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --use_8bit_adam \ + --gradient_checkpointing \ + --min_masking_rate 0.0 \ + --cond_dropout_prob 0.1 \ + --split_vae_encode 1 \ + --allow_tf32 \ + --seed 42 \ + --report_to wandb + + # --use_precomputed_features \ + # --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features \ + +# accelerate launch --multi_gpu --gpu_ids '0,1,2,3,4,5,6,7' --main_process_port 25011 --num_processes 8 \ +# train/train_mei_video.py \ +# --use_precomputed_features \ +# --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features \ +# --text_encoder_architecture umt5-xxl \ +# --wan_pretrained_path Wan-AI/Wan2.1-T2V-1.3B \ +# --training_from_scratch True \ +# --pretrained_model_name_or_path "dummy" \ +# --wan_backbone_lr_ratio 1 \ +# --num_frames 4 \ +# --video_height 256 \ +# --video_width 448 \ +# --dataloader_num_workers 8 \ +# --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \ +# --instance_dataset OpenVid1MDataset \ +# --instance_data_dir "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ +# --train_batch_size 1 \ +# --gradient_accumulation_steps 1 \ +# --learning_rate 3e-4 \ +# --max_train_steps 10000 \ +# --checkpointing_steps 500 \ +# --validation_steps 500 \ +# --logging_steps 10 \ +# --validation_prompts "a cat playing" "a girl walking" \ +# --output_dir "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp" \ +# --mixed_precision bf16 \ +# --lr_scheduler constant \ +# --lr_warmup_steps 0 \ +# --use_8bit_adam \ +# --gradient_checkpointing \ +# --min_masking_rate 0.0 \ +# --cond_dropout_prob 0.1 \ +# --split_vae_encode 1 \ +# --allow_tf32 \ +# --seed 42 \ +# --report_to wandb + +# --pretrained_model_name_or_path "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000" \ \ No newline at end of file diff --git a/Meissonic/train/train_video_stage_1.sh b/Meissonic/train/train_video_stage_1.sh new file mode 100644 index 0000000000000000000000000000000000000000..05a90fee0d74822abcb0a7018408a2eac5117236 --- /dev/null +++ b/Meissonic/train/train_video_stage_1.sh @@ -0,0 +1,42 @@ + + +# Overall batch size: 8 (per GPU) * 4 (gradient accumulation) * 96 (GPU cards) = 3072 +# We set max steps to 100k, but we may not need that many; it depends on actual performance +# Typically 10 steps take ~20s, so we can run 1800 steps per hour and 43.2k steps per day, which equals about 132 epochs + +accelerate launch --multi_gpu --gpu_ids '0,1,2,3,4,5,6,7' --main_process_port 25011 --num_processes 8 \ + train/train_mei_video.py \ + --use_precomputed_features \ + --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features \ + --text_encoder_architecture umt5-xxl \ + --wan_pretrained_path Wan-AI/Wan2.1-T2V-1.3B \ + --training_from_scratch True \ + --pretrained_model_name_or_path "dummy" \ + --wan_backbone_lr_ratio 0.2 \ + --num_frames 17 \ + --video_height 128 \ + --video_width 128 \ + --dataloader_num_workers 8 \ + --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \ + --instance_dataset OpenVid1MDataset \ + --instance_data_dir "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \ + --train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --learning_rate 3e-4 \ + --max_train_steps 100000 \ + 
--checkpointing_steps 500 \ + --validation_steps 500 \ + --logging_steps 10 \ + --validation_prompts "a cat playing" "a girl walking" \ + --output_dir "./output_128x128_17f_8*4bs_4*8*8vqvae_0_2_ratio" \ + --mixed_precision bf16 \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --use_8bit_adam \ + --gradient_checkpointing \ + --min_masking_rate 0.0 \ + --cond_dropout_prob 0.1 \ + --split_vae_encode 1 \ + --allow_tf32 \ + --seed 42 \ + --report_to wandb \ No newline at end of file diff --git a/Meissonic/train/trainer_utils.py b/Meissonic/train/trainer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a4956a83dd94a140cab80c0baedd906713523f82 --- /dev/null +++ b/Meissonic/train/trainer_utils.py @@ -0,0 +1,45 @@ +# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +from pathlib import Path + + +def save_checkpoint(args, accelerator, global_step, logger): + output_dir = args.output_dir + + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if accelerator.is_main_process and args.checkpoints_total_limit is not None: + checkpoints = os.listdir(output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + + save_path = Path(output_dir) / f"checkpoint-{global_step}" + accelerator.save_state(save_path) + logger.info(f"Saved state to {save_path}") diff --git a/Meissonic/vidtok_cache/VidTok/.gitignore b/Meissonic/vidtok_cache/VidTok/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..81849eac5c147c73d4202463658bd4b49571d163 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/.gitignore @@ -0,0 +1,167 @@ +amlt +.amltconfig +checkpoints +logs +wandb +tmp + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/Meissonic/vidtok_cache/VidTok/CODE_OF_CONDUCT.md b/Meissonic/vidtok_cache/VidTok/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..f9ba8cf65f3e3104dd061c178066ec8247811f33 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/CODE_OF_CONDUCT.md @@ -0,0 +1,9 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
+ +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/Meissonic/vidtok_cache/VidTok/LICENSE b/Meissonic/vidtok_cache/VidTok/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9e841e7a26e4eb057b24511e7b92d42b257a80e5 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/Meissonic/vidtok_cache/VidTok/README.md b/Meissonic/vidtok_cache/VidTok/README.md new file mode 100644 index 0000000000000000000000000000000000000000..edd9df5e2319e6dab35604d0d67192381dfacabf --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/README.md @@ -0,0 +1,472 @@ +
+ +# VidTok
A Family of Versatile and State-Of-The-Art Video Tokenizers + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2412.13061)   [![GitHub](https://img.shields.io/badge/GitHub-Code-blue?logo=github&logoColor=white)](https://github.com/microsoft/VidTok)   [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow)](https://huggingface.co/microsoft/VidTok) +
+ +--- + +![radar](assets/radar.png) + + We introduce VidTok, a cutting-edge family of video tokenizers that excels in both continuous and discrete tokenization. VidTok incorporates several key advancements over existing approaches: + * ⚡️ **Efficient Architecture**. Separate spatial and temporal sampling reduces computational complexity without sacrificing quality. + * 🔥 **Advanced Quantization**. Finite Scalar Quantization (FSQ) addresses training instability and codebook collapse in discrete tokenization. + * 💥 **Enhanced Training**. A two-stage strategy (pre-training on low-res videos and fine-tuning on high-res) boosts efficiency. Reduced frame rates improve motion dynamics representation. + +VidTok, trained on a large-scale video dataset, outperforms previous models across all metrics, including PSNR, SSIM, LPIPS, and FVD. + +https://github.com/user-attachments/assets/a3341037-130d-4a83-aba6-c3daeaf66932 + +## 🔥 News +* August, 2025: 🚀 Introduced spatial tiling for large resolutions (>256), reducing GPU memory usage to ~6 GB when encoding and decoding a 17 × 768 × 768 video. +* March, 2025: 🚀 [VidTwin](https://github.com/microsoft/VidTok/tree/main/vidtwin) has been accepted by CVPR 2025, and the [checkpoint](https://huggingface.co/microsoft/vidtwin) was released! +* March, 2025: 🚀 [VidTok v1.1](#-updates-in-vidtok-v11) was released! We fine-tuned all causal models on long videos to support tokenization and reconstruction of videos of arbitrary length with fine temporal smoothness. [Relevant checkpoints](https://huggingface.co/microsoft/VidTok/tree/main/checkpoints/vidtok_v1_1) are being updated continuously. +* December, 2024: 🚀 [VidTwin](https://github.com/microsoft/VidTok/tree/main/vidtwin) was released! +* December, 2024: 🚀 [VidTok](https://github.com/microsoft/vidtok) was released! + + +## 💥 Updates in VidTok v1.1 +> VidTok v1.1 is an update for causal models. We fine-tuned all causal models on long videos to support tokenization and reconstruction of videos of arbitrary length with fine temporal smoothness. See performance [here](#v11-performance). + +### v1.1: Long Video Reconstruction +Run the following inference script to [reconstruct an input video](#reconstruct-an-input-video): +```bash +python scripts/inference_reconstruct.py --config CONFIG_v1_1 --ckpt CKPT_v1_1 --input_video_path VIDEO_PATH --input_height 256 --input_width 256 --sample_fps 30 --chunk_size CHUNK_SIZE --output_video_dir OUTPUT_DIR --read_long_video +# Set `CHUNK_SIZE` according to your GPU memory; 16 is recommended. +``` +and run the following inference script to [evaluate the reconstruction performance](#performance-evaluation): +```bash +python scripts/inference_evaluate.py --config CONFIG_v1_1 --ckpt CKPT_v1_1 --data_dir DATA_DIR --input_height 256 --input_width 256 --sample_fps 30 --chunk_size CHUNK_SIZE --read_long_video +# Set `CHUNK_SIZE` according to your GPU memory; 16 is recommended. +``` + +For easy usage of VidTok v1.1 models, refer to [this script](#easy-usage) and make the following revision: +```python +# Use VidTok v1.1 models +cfg_path = "configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml" +ckpt_path = "checkpoints/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.ckpt" + +... + +model.to('cuda').eval() +# Using tiling inference to save memory usage +model.use_tiling = True +model.t_chunk_enc = 16 +model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor +model.use_overlap = True +# random input: long video +x_input = (torch.rand(1, 3, 129, 256, 256) * 2 - 1).to('cuda') + +... 
+ +if x_recon.shape[2] != x_input.shape[2]: + x_recon = x_recon[:, :, -x_input.shape[2]:, ...] +``` + +### v1.1: Long Video Fine-tuning +Follow this [training guidance](#fine-tune-on-custom-data) to fine-tune on your custom long video data and note that: +- Compared to VidTok v1.0, we tend to use longer sequences to fine-tune the model (for example, setting `NUM_FRAMES_1` to 33, 49, or larger). +- The resolution and the sequence length of training data should be adjusted according to GPU memory. + +### v1.1: Performance +| Model | Regularizer | Causal | VCR | PSNR | SSIM | LPIPS | FVD | +|------|------|------|------|------|------|------|------| +| [vidtok_kl_causal_488_16chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.ckpt) | KL-16chn | ✔️ | 4x8x8 | 35.13 | 0.941 | 0.049 | 87.4 | +| [vidtok_kl_causal_41616_16chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.ckpt) | KL-16chn | ✔️ | 4x16x16 | 29.61 | 0.854 | 0.113 | 162.7 | +| [vidtok_kl_causal_288_8chn_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.ckpt) | KL-8chn | ✔️ | 2x8x8 | 34.59 | 0.935 | 0.051 | 78.2 | +| [vidtok_fsq_causal_488_32768_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.ckpt) | FSQ-32,768 | ✔️ | 4x8x8 | 29.39 | 0.856 | 0.114 | 168.5 | +| [vidtok_fsq_causal_888_32768_v1_1](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.ckpt) | FSQ-32,768 | ✔️ | 8x8x8 | 27.95 | 0.817 | 0.142 | 293.2 | +- This is the evaluation result of long video reconstruction conducted on each complete video in [MCL_JCL](https://mcl.usc.edu/mcl-jcv-dataset/) dataset, with a sample fps of 30 and a resolution of `256x256`. + + +## 🔧 Setup +1. Clone this repository and navigate to VidTok folder: +```bash +git clone https://github.com/microsoft/VidTok +cd VidTok +``` +2. We provide an `environment.yaml` file for setting up a Conda environment. Conda's installation instructions are available [here](https://docs.anaconda.com/miniconda/index.html). +```bash +# 1. Prepare conda environment +conda env create -f environment.yaml +# 2. Activate the environment +conda activate vidtok +``` + +We recommend using 1+ high-end GPU for training and inference. We have done all testing and development using A100 and MI300X GPUs. For convenience, we also provide prebuilt [Docker](https://hub.docker.com/) images with required dependencies. You can use it as follows: + +```bash +# NVIDIA GPUs +docker run -it --gpus all --shm-size 256G --rm -v `pwd`:/workspace --workdir /workspace \ + deeptimhe/ubuntu22.04-cuda12.1-python3.10-pytorch2.5:orig-vidtok bash +# AMD GPUs +docker run -it --gpus all --shm-size 256G --rm -v `pwd`:/workspace --workdir /workspace \ + deeptimhe/ubuntu22.04-rocm6.2.4-python3.10-pytorch2.5:orig-vidtok bash +``` + +## 🎈 Checkpoints +Download pre-trained models [here](https://huggingface.co/microsoft/VidTok/tree/main/checkpoints), and put them in `checkpoints` folder, like: +``` +└── checkpoints + ├── vidtok_v1_1 + │ ├── vidtok_kl_causal_488_16chn_v1_1.ckpt + │ └── ... 
+ ├── vidtok_fsq_causal_41616_262144.ckpt + ├── vidtok_fsq_causal_488_262144.ckpt + ├── vidtok_fsq_causal_488_32768.ckpt + ├── vidtok_fsq_causal_488_4096.ckpt + ├── vidtok_fsq_noncausal_41616_262144.ckpt + ├── vidtok_fsq_noncausal_488_262144.ckpt + ├── vidtok_kl_causal_288_8chn.ckpt + ├── vidtok_kl_causal_41616_4chn.ckpt + ├── vidtok_kl_causal_444_4chn.ckpt + ├── vidtok_kl_causal_488_16chn.ckpt + ├── vidtok_kl_causal_488_4chn.ckpt + ├── vidtok_kl_causal_488_8chn.ckpt + ├── vidtok_kl_noncausal_41616_16chn.ckpt + ├── vidtok_kl_noncausal_41616_4chn.ckpt + ├── vidtok_kl_noncausal_488_16chn.ckpt + └── vidtok_kl_noncausal_488_4chn.ckpt +``` +Each checkpoint has a corresponding config file with the same name in `configs` folder. + + +## 🔆 Performance + +| Model | Regularizer | Causal | VCR | PSNR | SSIM | LPIPS | FVD | +|------|------|------|------|------|------|------|------| +| [vidtok_kl_causal_488_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_4chn.ckpt) | KL-4chn | ✔️ | 4x8x8 | 29.64 | 0.852| 0.114| 194.2| +| [vidtok_kl_causal_488_8chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_8chn.ckpt) | KL-8chn | ✔️ |4x8x8 | 31.83 | 0.897| 0.083| 109.3| +| [vidtok_kl_causal_488_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_488_16chn.ckpt) | KL-16chn | ✔️ | 4x8x8 | 35.04 |0.942 |0.047 | 78.9| +| [vidtok_kl_causal_288_8chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_288_8chn.ckpt) | KL-8chn | ✔️ | 2x8x8 | 33.86 | 0.928 |0.057 | 80.7 | +| [vidtok_kl_causal_444_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_444_4chn.ckpt) | KL-4chn | ✔️ | 4x4x4 | 34.78 | 0.941 | 0.051| 87.2| +| [vidtok_kl_causal_41616_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_causal_41616_4chn.ckpt) | KL-4chn | ✔️ | 4x16x16 | 25.05 | 0.711| 0.228| 549.1| +| [vidtok_kl_noncausal_488_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_488_4chn.ckpt) | KL-4chn | ✖️ | 4x8x8 | 30.60 | 0.876 | 0.098| 157.9| +| [vidtok_kl_noncausal_488_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_488_16chn.ckpt) | KL-16chn | ✖️ | 4x8x8 | 36.13 | 0.950 | 0.044| 60.5| +| [vidtok_kl_noncausal_41616_4chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_41616_4chn.ckpt) | KL-4chn | ✖️ | 4x16x16 | 26.06 | 0.751 | 0.190|423.2 | +| [vidtok_kl_noncausal_41616_16chn](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_kl_noncausal_41616_16chn.ckpt) | KL-16chn | ✖️ | 4x16x16 | 30.69 | 0.878 | 0.095| 147.1| +| [vidtok_fsq_causal_488_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_262144.ckpt) | FSQ-262,144 | ✔️ | 4x8x8 | 29.82 | 0.867 |0.106 | 160.1| +| [vidtok_fsq_causal_488_32768](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_32768.ckpt) | FSQ-32,768 | ✔️ | 4x8x8 | 29.16 | 0.854 | 0.117| 196.9| +| [vidtok_fsq_causal_488_4096](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_488_4096.ckpt) | FSQ-4096 | ✔️ | 4x8x8 | 28.36 | 0.832 | 0.133| 218.1| +| [vidtok_fsq_causal_41616_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_causal_41616_262144.ckpt) | FSQ-262,144 | ✔️ | 4x16x16 | 25.38 | 0.738 |0.206 | 430.1| +| 
[vidtok_fsq_noncausal_488_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_noncausal_488_262144.ckpt) | FSQ-262,144 | ✖️ | 4x8x8 | 30.78 | 0.889| 0.091| 132.1| +| [vidtok_fsq_noncausal_41616_262144](https://huggingface.co/microsoft/VidTok/blob/main/checkpoints/vidtok_fsq_noncausal_41616_262144.ckpt) | FSQ-262,144 | ✖️ | 4x16x16 | 26.37 | 0.772| 0.171| 357.0| + +- `VCR` indicates the video compression ratio `TxHxW`. +- The above table shows model performance evaluated on 30 test videos in [MCL_JCL](https://mcl.usc.edu/mcl-jcv-dataset/) dataset, with a sample fps of 30. The input size is `17x256x256` for causal models and `16x256x256` for non-causal models. + +## 🔛 Training + +### Data Preparation +1. Put all training videos under `DATA_DIR`: +``` +└── DATA_DIR + ├── subset1 + │ ├── videoname11.mp4 + │ └── videoname12.mp4 + ├── subset2 + │ ├── videoname21.mp4 + │ ├── videoname22.mp4 + │ └── subsubset1 + │ ├── videoname211.mp4 + │ └── videoname212.mp4 + └── ... +``` +2. Prepare a `.csv` meta file to record the relative paths of these videos with respect to `DATA_DIR`, like: +``` +videos +subset1/videoname11.mp4 +subset2/videoname21.mp4 +subset2/subsubset1/videoname211.mp4 +``` + +> Validation data is also prepared following the above steps. + +### Fine-tune on Custom Data +1. Prepare your own training and validation data following [Data Preparation](#data-preparation). +2. Select the appropriate `CONFIG` file from `configs` folder based on your needs, and modify the following parameters: + - Specify the `ckpt_path` parameter to initialize the model with pre-trained checkpoint parameters: + ```yaml + model: + params: + ckpt_path: PATH_TO_CHECKPOINT # train from existing checkpoint + ``` + - Specify the `data` section to use your own training and validation data: + ```yaml + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: NUM_FRAMES_1 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_1 # sample fps for training data + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: NUM_FRAMES_2 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_2 # sample fps for validation data + start_index: 0 # fixed value to ensure the same sampled data + ``` + - Set `fix_encoder` and `fix_decoder` to be `False` to enable full model fine-tuning: + ```yaml + model: + params: + encoder_config: + params: + fix_encoder: false + fix_decoder: false + ``` + - Other hyperparameters according to your needs. + +3. Run the following command to start training: +```bash +python main.py -b CONFIG --logdir LOGDIR + +# You can also use `torchrun` to start the training code. +``` +Training logs and checkpoints are saved in `LOGDIR`. + +It is recommended to use [Weights & Biases](https://wandb.ai/site) as the data visualization tool ([TensorBoard](https://www.tensorflow.org/tensorboard) by default). 
Use `wandb login` to log in first, and then run: +```bash +python main.py -b CONFIG --logdir LOGDIR --wandb --wandb_entity ENTITY --wandb_project PROJECT +``` + + +### Train from Scratch +
+Two-stage Training +We adopt a two-stage training strategy to improve training efficiency: initially pre-training the full model on low-resolution videos, followed by fine-tuning only the decoder on high-resolution videos. + +| First Stage | Second Stage | Fix encoder | PSNR | SSIM | LPIPS | GPU Hours| +|------|------|------|------|------|------|------| +| 256 x 256 | - | - | 29.19 | 0.843 | 0.127| 3,072| +| 128 x 128 | 256 x 256 | ✔️ | 29.21 | 0.843 | 0.125| 1,536| + +1. Prepare your own training and validation data following [Data Preparation](#data-preparation). +2. Select the appropriate `CONFIG` file from `configs` folder based on your needs, and specify the `data` section to use your own training and validation data: + ```yaml + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 # vary in different training stages + input_width: INPUT_WIDTH_1 # vary in different training stages + sample_num_frames: NUM_FRAMES_1 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_1 # sample fps for training data + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: NUM_FRAMES_2 # typically set to 17 for causal models and 16 for non-causal models + sample_fps: SAMPLE_FPS_2 # sample fps for validation data + start_index: 0 # fixed value to ensure the same sampled data + ``` + +3. Start the first stage of training. First, revise the `CONFIG` file to enable full model training with low-resolution data: +```yaml +model: + params: + # ckpt_path: # disable this parameter so as to train from scratch + encoder_config: + params: + fix_encoder: false + fix_decoder: false +data: + params: + train: + params: + video_params: + input_height: 128 + input_width: 128 +``` +Then revise other hyperparameters according to your needs, and run the training command to start training as in [Fine-tune on Custom Data](#fine-tune-on-custom-data). We train VidTok for 50,000 steps with batch size 16 in this stage. + +4. Start the second stage of training. First, revise the `CONFIG` file to enable the fine-tuning of the decoder with high-resolution data: +```yaml +model: + params: + ckpt_path: CKPT_PATH # path to the saved checkpoint after the first stage of training + encoder_config: + params: + fix_encoder: true + fix_decoder: false +data: + params: + train: + params: + video_params: + input_height: 256 + input_width: 256 +``` +Then revise other hyperparameters according to your needs, and run the training command to start training as in [Fine-tune on Custom Data](#fine-tune-on-custom-data). We train VidTok for 30,000 steps with batch size 8 in this stage. +
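A minimal sketch of how the two stages chain together as two launches of `main.py` (CONFIG_STAGE1, CONFIG_STAGE2, and the LOGDIR values below are placeholders in the same spirit as CONFIG and LOGDIR above, not files shipped with the repo):

```bash
# Stage 1: full-model training from scratch on low-resolution data (e.g. 128x128);
# CONFIG_STAGE1 has ckpt_path disabled and fix_encoder/fix_decoder set to false
python main.py -b CONFIG_STAGE1 --logdir LOGDIR_STAGE1

# Stage 2: decoder-only fine-tuning on high-resolution data (e.g. 256x256);
# CONFIG_STAGE2 sets ckpt_path to a checkpoint saved under LOGDIR_STAGE1 and fix_encoder: true
python main.py -b CONFIG_STAGE2 --logdir LOGDIR_STAGE2
```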
+ + +## 🚀 Inference + +### Easy Usage +We provide the following example for quick usage of our models. It works for both continuous and discrete tokenization and both causal and non-causal models. +Just provide the path to the configuration file `cfg_path` and checkpoint file `ckpt_path`. +```python +import torch +from scripts.inference_evaluate import load_model_from_config + +cfg_path = "configs/vidtok_kl_causal_488_4chn.yaml" +ckpt_path = "checkpoints/vidtok_kl_causal_488_4chn.ckpt" + +# load pre-trained model +model = load_model_from_config(cfg_path, ckpt_path) +model.to('cuda').eval() +# random input +num_frames = 17 if model.is_causal else 16 +x_input = (torch.rand(1, 3, num_frames, 256, 256) * 2 - 1).to('cuda') # [B,C,T,H,W], range -1~1 +# model forward +with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16): + _, x_recon, _ = model(x_input) +assert x_input.shape == x_recon.shape +``` +If you want to directly infer from latent tokens, run the following code: +```python +z, reg_log = model.encode(x_input, return_reg_log=True) +# infer from continuous latent space +x_recon = model.decode(z) +# infer from discrete latent tokens +x_recon = model.decode(reg_log['indices'], decode_from_indices=True) +``` + +### Use Torch Compile to Speed Up Inference +Using compiled components in VidTok can speed up inference by as much as 2X. The following code snippet demonstrates how to compile our models. + +```python +import torch +from scripts.inference_evaluate import load_model_from_config + +torch._inductor.config.cpp.weight_prepack=True +torch._inductor.config.freezing=True + +cfg_path = "configs/vidtok_kl_causal_488_4chn.yaml" +ckpt_path = "checkpoints/vidtok_kl_causal_488_4chn.ckpt" + +# load pre-trained model +model = load_model_from_config(cfg_path, ckpt_path) +model.to('cuda').eval() +model.encoder = torch.compile(model.encoder) +model.decoder = torch.compile(model.decoder) + +# random input +num_frames = 17 if model.is_causal else 16 +x_input = (torch.rand(1, 3, num_frames, 256, 256) * 2 - 1).to('cuda') # [B,C,T,H,W], range -1~1 + +# Warm Up +with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16): + _, x_recon, _ = model(x_input) + +torch.cuda.synchronize() +import time +start = time.time() +with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16): + for i in range(10): + _, x_recon, _ = model(x_input) +torch.cuda.synchronize() +print(f"Average inference time: {(time.time() - start)/10 :.4f} seconds") +``` + +### Reconstruct an Input Video +```bash +python scripts/inference_reconstruct.py --config CONFIG --ckpt CKPT --input_video_path VIDEO_PATH --input_height 256 --input_width 256 --sample_fps 30 --output_video_dir OUTPUT_DIR +``` +- Set `VIDEO_PATH` to the path of your test video. We provide an example video in `assets/example.mp4`. +- The reconstructed video is saved in `OUTPUT_DIR`. +- For causal models, you can choose to add `--pad_gen_frames` to the command line, which may improve the smoothness of the reconstructed video. + +### Performance Evaluation +We also provide a script, `scripts/inference_evaluate.py`, to evaluate video reconstruction performance in PSNR, SSIM and LPIPS. + +1. Put all of your test videos under `DATA_DIR`. +2. 
Run the following command, and all `.mp4` videos under `DATA_DIR` will be tested: +```bash +python scripts/inference_evaluate.py --config CONFIG --ckpt CKPT --data_dir DATA_DIR --input_height 256 --input_width 256 --sample_fps 30 +``` +(Optional) If you only want to test certain videos under `DATA_DIR`, you need to prepare a `.csv` meta file +to indicate the video files to be tested (refer to [Data Preparation](#data-preparation)). And add `--meta_path META_PATH` to the above command to specify the path to the `.csv` meta file. + +## 💡 Intended Uses + +We are sharing our model with the research community to foster further research in this area: +* Training your own video tokenizers for research purpose. +* Video tokenization with various compression rates. + + +## 🪧 Out-of-scope Uses + +Our models are not specifically designed or evaluated for all downstream purposes. Developers should consider common limitations of video tokenizers (e.g., performance degradation on out-of-domain data) as they select use cases, and evaluate and mitigate for privacy, safety, and fairness before using within a specific downstream use case, particularly for high-risk scenarios. + +Developers should be aware of and adhere to applicable laws or regulations (including privacy, trade compliance laws, etc.) that are relevant to their use case. + + +## 🤖️ Risks and Limitations + +Some of the limitations of this model to be aware of include: +* VidTok may lose detailed information on the reconstructed content. +* VidTok inherits any biases, errors, or omissions characteristic of its training data. +* VidTok was developed for research and experimental purposes. Further testing and validation are needed before considering its application in commercial or real-world scenarios. + + +## 🤗 Acknowledgments + +This codebase borrows code from [generative-models](https://github.com/Stability-AI/generative-models). We thank Stability AI for its efforts and innovations, which have made the development process more efficient and convenient. + +Thank you to everyone who contributed their wisdom and efforts to this project. + +## ✏️ BibTeX + +```bibtex +@article{tang2024vidtok, + title={VidTok: A Versatile and Open-Source Video Tokenizer}, + author={Tang, Anni and He, Tianyu and Guo, Junliang and Cheng, Xinle and Song, Li and Bian, Jiang}, + year={2024}, + journal={arXiv preprint arXiv:2412.13061}, +} +``` + +## ☎️ Contact + +We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at tianyuhe@microsoft.com. + +## 📄 Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
+For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + + +## 📍 Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/Meissonic/vidtok_cache/VidTok/SECURITY.md b/Meissonic/vidtok_cache/VidTok/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..b3c89efc852e22f71eabf5dfbc6ac62493425eb6 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. 
+ +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). + + diff --git a/Meissonic/vidtok_cache/VidTok/SUPPORT.md b/Meissonic/vidtok_cache/VidTok/SUPPORT.md new file mode 100644 index 0000000000000000000000000000000000000000..291d4d43733f4c15a81ff598ec1c99fd6c18f64c --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/SUPPORT.md @@ -0,0 +1,25 @@ +# TODO: The maintainer of this repo has not yet edited this file + +**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? + +- **No CSS support:** Fill out this template with information about how to file issues and get help. +- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. +- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. + +*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* + +# Support + +## How to file issues and get help + +This project uses GitHub Issues to track bugs and feature requests. Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or +feature request as a new Issue. + +For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE +FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER +CHANNEL. WHERE WILL YOU HELP PEOPLE?**. + +## Microsoft Support Policy + +Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/Meissonic/vidtok_cache/VidTok/assets/example.mp4 b/Meissonic/vidtok_cache/VidTok/assets/example.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..21be26c8a8f5dbb76de5a225e5b284fdbb024904 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/assets/example.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:588ca89fae7320a079d4f77cf963f88075959f06310594ab35d3e04b844c4d50 +size 540937 diff --git a/Meissonic/vidtok_cache/VidTok/assets/gemini.png b/Meissonic/vidtok_cache/VidTok/assets/gemini.png new file mode 100644 index 0000000000000000000000000000000000000000..fe326dc4b4b4b6db82575b15e9b4bf2f3b63e63d Binary files /dev/null and b/Meissonic/vidtok_cache/VidTok/assets/gemini.png differ diff --git a/Meissonic/vidtok_cache/VidTok/assets/radar.png b/Meissonic/vidtok_cache/VidTok/assets/radar.png new file mode 100644 index 0000000000000000000000000000000000000000..6eff9338928faf73c0e39e859237bc6c588ec183 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/assets/radar.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cfdef783e26264ff671f81845b54471b237f5cd7df0dbd63642fde0c20f935e +size 424581 diff --git a/Meissonic/vidtok_cache/VidTok/assets/vidtwin.png b/Meissonic/vidtok_cache/VidTok/assets/vidtwin.png new file mode 100644 index 0000000000000000000000000000000000000000..a7ae41105384d077a7d128d1d687e3ee8e17fbd8 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/assets/vidtwin.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e58210f7ca7784a4df737f8a2ece2b5e9f85fff3bf423b24aa6aeeb0b196cef +size 549933 diff --git a/Meissonic/vidtok_cache/VidTok/assets/vidtwin_demo.png b/Meissonic/vidtok_cache/VidTok/assets/vidtwin_demo.png new file mode 100644 index 0000000000000000000000000000000000000000..ca79126104a7cbfb67c0acb70f03c6e66546675c --- /dev/null +++ 
b/Meissonic/vidtok_cache/VidTok/assets/vidtwin_demo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8e15c46050cc7957bc5d334bd1902d58b9be9aa7669df82f2ae9ba08c90585 +size 6371846 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80b2edbebf97fd747933000d32ccbdfe0e135702 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_41616_262144.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_41616_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + 
max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_262144.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26f9c3e94ed8b64a681b2ed3887929109b6ccc53 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_262144.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_32768.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_32768.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..5e661c533b367c7be4eea8de629e074b395e1684 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_32768.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_32768.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_4096.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_4096.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bf654bb21c1bd67ddde3f4878497c6ad780503a --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_causal_488_4096.yaml @@ -0,0 +1,118 @@ +model: + base_learning_rate: 
1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_causal_488_4096.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: false + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8] # codebook size: 8*8*8*8=4096 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e400ff097155465e2a477bfbc3ca32f346a0ca12 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_41616_262144.yaml @@ -0,0 +1,117 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_noncausal_41616_262144.ckpt # train from existing checkpoint + 
ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c8731af435aae0efbde9716ee696ce3efd30d25 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_fsq_noncausal_488_262144.yaml @@ -0,0 +1,117 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_fsq_noncausal_488_262144.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm 
# layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_288_8chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_288_8chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c20bf1aaf2aa97f67d19d0de1dbac47cf2d55ac --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_288_8chn.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_288_8chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [1] + tempo_us: [2] + time_downsample_factor: 2 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + 
regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1db41cd612f5e6b1390bebda33e559e347b60907 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_41616_4chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_41616_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: 
{'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_444_4chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_444_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bcbbd11cba27e60bd2f44866655adcfff91f91a6 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_444_4chn.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_444_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + spatial_ds: [1, 2] + spatial_us: [1, 2] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: 
INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_16chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7527679fe0766616df45223fe1e9595101fadc5 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_16chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 
0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_4chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d14b0d75435b24affad90095e4d6e42c6525a8d --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_4chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: 
false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_8chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_8chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..336cbac8319dc3f9c232c02889e598fdf098a777 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_causal_488_8chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_8chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml 
b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b32064f1589eb3b21cb978f808a95fd6ade31ff9 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_16chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_41616_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdd33a5ebc635cbf9a04fb94006d14a473d87a3c --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_41616_4chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_41616_4chn.ckpt # train from existing 
checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47a1090f42b07655626f0d522d756157fdeb1d6a --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_16chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_488_16chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + 
+ regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05bb8c54df1374c0f67c79d256195dd6451acfbd --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_kl_noncausal_488_4chn.yaml @@ -0,0 +1,111 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_noncausal_488_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dnoncausal.Encoder3D + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false + fix_decoder: false + + decoder_config: + target: vidtok.modules.model_3dnoncausal.Decoder3D + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + 
num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 16 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 16 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74364fa3209abbda65b0c23311d92ad0975570dd --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.yaml @@ -0,0 +1,120 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_41616_262144_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 6 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8*8=262144 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: 
vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..334749091ff2c561208f169029eae4704d4213e3 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.yaml @@ -0,0 +1,120 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_488_32768_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 
# DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2966072ff2a073d404d0cb438674acfc92319033 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.yaml @@ -0,0 +1,122 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_fsq_causal_888_32768_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: false + z_channels: 5 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [0, 1, 2] + tempo_us: [1, 2, 3] + time_downsample_factor: 8 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.FSQRegularizer + params: + levels: [8, 8, 8, 8, 8] # codebook size: 8*8*8*8*8=32768 + entropy_loss_weight: 0.1 + entropy_loss_annealing_steps: 2000 + entropy_loss_annealing_factor: 3 + commitment_loss_weight: 0.25 + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for 
training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..deefcc29ecd8212e8bd3edd9d9870d8c64079db7 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.yaml @@ -0,0 +1,116 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_288_8chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 8 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + tempo_ds: [1] + tempo_us: [2] + time_downsample_factor: 2 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: 
vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..965b243859e513244d3d3fd9cc68aa27aee887da --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_41616_16chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: 
INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26f68342b48dd5fcae8e5bfcd2fcb5cc5bd1ab1c --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_488_16chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 16 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + 
callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4badafed742c03fd7850f197cdb8207c59b992e --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml @@ -0,0 +1,114 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder_v1_1.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + use_tiling: False + + encoder_config: + target: vidtok.modules.model_3dcausal_v1_1.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + interpolation_mode: trilinear # nearest, trilinear + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal_v1_1.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 1 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 33 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 33 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + 
log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/vidtok_cache/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml b/Meissonic/vidtok_cache/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bcd59650c229ac57560602854015262969d658b --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml @@ -0,0 +1,154 @@ +model: + base_learning_rate: 1.6e-4 + target: vidtwin.models.vidtwin_ae.VidAutoEncoderQformerCompactSymVidVAE + params: + input_key: jpg + monitor: val/rec_loss + ckpt_path: PATH_TO_CHECKPOINT + ignore_keys: [] + expect_ch: 8 + cont_num_blocks: 1 + downsample_motion: True + motion_num_blocks: 1 + d_dim: 8 + + temporal_qformer_config: + target: vidtwin.modules.qformer.MyQformerInterface + params: + num_query_tokens: 16 + query_hidden_size: 64 + encoder_hidden_size: 768 + + encoder_config: + target: vidtwin.modules.st_transformer.STTEncoder + params: + in_channels: 3 + input_size: [16, 224, 224] + patch_size: [1, 16, 16] + hidden_size: 768 + depth: 16 + num_heads: 12 + temporal_casual: true + + decoder_config: + target: vidtwin.modules.st_transformer.STTDecoder + params: + in_channels: 3 + input_size: [16, 224, 224] + patch_size: [1, 16, 16] + hidden_size: 768 + depth: 16 + num_heads: 12 + temporal_casual: true + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + perceptual_weight: 0.05 + disc_start: 20001 + disc_weight: 0.05 + learn_logvar: True + dims: 3 + disc_type: 2d + regularization_weights: + kl_loss: 0.001 + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + params: + sample: True + + + lr_scheduler_config_d: + target: vidtok.models.vidtwin_ae.LambdaWarmUpCosineScheduler + params: + lr_min: 0 + lr_max: 1.5e-05 + lr_start: 1.0e-05 + warmup_steps: 5000 + lr_scheduler_config_g: + target: vidtok.models.vidtwin_ae.LambdaWarmUpCosineScheduler + params: + lr_min: 0 + lr_max: 3.0e-05 + lr_start: 0 + warmup_steps: 5000 + optimizer_config: + target: torch.optim.AdamW + params: + betas: + - 0 + - 0.9 + weight_decay: 0.0001 + lr_scheduler_config: + target: inverse_sqrt + params: + num_warmup_steps: 2000 + frequency: 1 + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: 224 + input_width: 224 + sample_num_frames: 16 + sample_fps: 8 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: 224 + input_width: 224 + sample_num_frames: 16 + sample_fps: 8 + start_index: 0 + + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: True + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + 
enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 2 + + + + trainer: + # precision: bf16-mixed # 16-mixed + benchmark: True + devices: 4 + num_sanity_val_steps: 10 + val_check_interval: 5000 + accumulate_grad_batches: 1 + max_epochs: 10 diff --git a/Meissonic/vidtok_cache/VidTok/environment.yaml b/Meissonic/vidtok_cache/VidTok/environment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6abf1dcc7cefdab27b272681c0395791b3d432f --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/environment.yaml @@ -0,0 +1,114 @@ +name: vidtok +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2024.11.26=h06a4308_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - libuuid=1.41.5=h5eee18b_0 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.15=h5eee18b_0 + - pip=24.2=py310h06a4308_0 + - python=3.10.15=he870216_1 + - readline=8.2=h5eee18b_0 + - setuptools=75.1.0=py310h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tk=8.6.14=h39e8969_0 + - wheel=0.44.0=py310h06a4308_0 + - xz=5.4.6=h5eee18b_1 + - zlib=1.2.13=h5eee18b_1 + - pip: + - absl-py==2.1.0 + - aiohappyeyeballs==2.4.4 + - aiohttp==3.11.9 + - aiosignal==1.3.1 + - antlr4-python3-runtime==4.9.3 + - appdirs==1.4.4 + - async-timeout==5.0.1 + - attrs==24.2.0 + - av==12.0.0 + - beartype==0.18.2 + - certifi==2024.8.30 + - charset-normalizer==3.4.0 + - click==8.1.7 + - contourpy==1.3.1 + - cycler==0.12.1 + - decord==0.6.0 + - docker-pycreds==0.4.0 + - einops==0.8.0 + - filelock==3.16.1 + - fonttools==4.55.1 + - frozenlist==1.5.0 + - fsspec==2024.10.0 + - gitdb==4.0.11 + - gitpython==3.1.43 + - grpcio==1.68.1 + - idna==3.10 + - imageio==2.34.0 + - jinja2==3.1.4 + - kiwisolver==1.4.7 + - lightning==2.2.4 + - lightning-utilities==0.11.9 + - markdown==3.7 + - markdown-it-py==3.0.0 + - markupsafe==3.0.2 + - matplotlib==3.8.4 + - mdurl==0.1.2 + - mpmath==1.3.0 + - multidict==6.1.0 + - natsort==8.4.0 + - networkx==3.4.2 + - numpy==1.26.4 + - nvidia-cublas-cu12==12.1.3.1 + - nvidia-cuda-cupti-cu12==12.1.105 + - nvidia-cuda-nvrtc-cu12==12.1.105 + - nvidia-cuda-runtime-cu12==12.1.105 + - nvidia-cudnn-cu12==8.9.2.26 + - nvidia-cufft-cu12==11.0.2.54 + - nvidia-curand-cu12==10.3.2.106 + - nvidia-cusolver-cu12==11.4.5.107 + - nvidia-cusparse-cu12==12.1.0.106 + - nvidia-nccl-cu12==2.19.3 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.1.105 + - omegaconf==2.3.0 + - opencv-python==4.6.0.66 + - packaging==24.2 + - pandas==2.1.4 + - pillow==11.0.0 + - propcache==0.2.1 + - protobuf==4.25.5 + - psutil==6.1.0 + - pygments==2.18.0 + - pyparsing==3.2.0 + - python-dateutil==2.9.0.post0 + - pytorch-lightning==2.2.4 + - pytz==2024.2 + - pyyaml==6.0.2 + - requests==2.32.3 + - rich==13.5.3 + - safetensors==0.4.2 + - sentry-sdk==2.19.0 + - setproctitle==1.3.4 + - six==1.17.0 + - smmap==5.0.1 + - sympy==1.13.3 + - tensorboard==2.16.2 + - tensorboard-data-server==0.7.2 + - torch==2.2.2 + - torchmetrics==1.6.0 + - torchvision==0.17.2 + - tqdm==4.67.1 + - triton==2.2.0 + - typing-extensions==4.12.2 + - tzdata==2024.2 + - urllib3==2.2.3 + - wandb==0.16.6 + - werkzeug==3.1.3 + - yarl==1.18.3 diff --git a/Meissonic/vidtok_cache/VidTok/main.py b/Meissonic/vidtok_cache/VidTok/main.py new file mode 100644 index 
0000000000000000000000000000000000000000..e03f0de8a1c683fbdde7177083c87a0a3df85f83 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/main.py @@ -0,0 +1,1124 @@ +import argparse +import datetime +import pytz +import glob +import inspect +import os +import re +import sys +import numpy as np +import warnings +warnings.filterwarnings("ignore") +from rich import print +from inspect import Parameter +from typing import Union +from matplotlib import pyplot as plt +from natsort import natsorted +from omegaconf import OmegaConf +from packaging import version +from PIL import Image +from pathlib import Path + +import torch +import torch.distributed as dist +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch import seed_everything +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +from vidtok.modules.util import (exists, instantiate_from_config, isheatmap, + print0, seed_anything) + +MULTINODE_HACKS = True + + +def default_trainer_args(): + argspec = dict(inspect.signature(Trainer.__init__).parameters) + argspec.pop("self") + default_args = { + param: argspec[param].default + for param in argspec + if argspec[param] != Parameter.empty + } + return default_args + + +def get_step_value(folder_name): + match = re.search(r"step=(\d+)", folder_name) + if match: + return int(match.group(1)) + return 0 + + +def get_parser(**parser_kwargs): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser(**parser_kwargs) + parser.add_argument( + "-n", + "--name", + type=str, + const=True, + default="", + nargs="?", + help="postfix for logdir", + ) + parser.add_argument( + "--no_date", + type=str2bool, + nargs="?", + const=True, + default=False, + help="if True, skip date generation for logdir and only use naming via opt.base or opt.name (+ opt.postfix, optionally)", + ) + parser.add_argument( + "-r", + "--resume", + type=str, + const=True, + default="", + nargs="?", + help="resume from logdir or checkpoint in logdir", + ) + parser.add_argument( + "-b", + "--base", + nargs="*", + metavar="base_config.yaml", + help="paths to base configs. Loaded from left-to-right. 
" + "Parameters can be overwritten or added with command-line options of the form `--key value`.", + default=list(), + ) + parser.add_argument( + "-t", + "--train", + type=str2bool, + const=True, + default=True, + nargs="?", + help="train", + ) + parser.add_argument( + "--no-test", + type=str2bool, + const=True, + default=True, + nargs="?", + help="disable test", + ) + parser.add_argument( + "-p", "--project", help="name of new or path to existing project" + ) + parser.add_argument( + "-d", + "--debug", + type=str2bool, + nargs="?", + const=True, + default=False, + help="enable post-mortem debugging", + ) + parser.add_argument( + "-s", + "--seed", + type=int, + default=23, + help="seed for seed_everything", + ) + parser.add_argument( + "--seed_rank", + type=str2bool, + nargs="?", + const=True, + default=False, + help="reset seed every rank on fit start", + ) + parser.add_argument( + "-f", + "--postfix", + type=str, + default="", + help="post-postfix for default name", + ) + parser.add_argument( + "-l", + "--logdir", + type=str, + default="logs", + help="directory for logging dat shit", + ) + parser.add_argument( + "--scale_lr", + type=str2bool, + nargs="?", + const=True, + default=False, + help="scale base-lr by ngpu * batch_size * n_accumulate", + ) + parser.add_argument( + "--legacy_naming", + type=str2bool, + nargs="?", + const=True, + default=False, + help="name run based on config file name if true, else by whole path", + ) + parser.add_argument( + "--enable_tf32", + type=str2bool, + nargs="?", + const=True, + default=True, + help="enables the TensorFloat32 format both for matmuls and cuDNN for pytorch 1.12", + ) + parser.add_argument( + "--startup", + type=str, + default=None, + help="Startuptime from distributed script", + ) + parser.add_argument( + "--wandb", + type=str2bool, + nargs="?", + const=True, + default=False, + help="log to wandb", + ) + parser.add_argument( + "--wandb_entity", + type=str, + default="", + help="Wandb entity name string", + ) + parser.add_argument( + "--wandb_key", + type=str, + default="", + help="Wandb key", + ) + parser.add_argument( + "--wandb_project", + type=str, + default="vidtok", + ) + parser.add_argument( + "--wandb_id", + type=str, + default=None, + help="automatically resume from the same wandb id" + "must be used in combination with --wandb_auto_resume False", + ) + parser.add_argument( + "--wandb_auto_resume", + type=str2bool, + nargs="?", + const=True, + default=True, + help="will find the latest run id in the logdir" + "if checkpoint_auto_resume is False, wandb_auto_resume will be ignored", + ) + parser.add_argument( + "--checkpoint_auto_resume", + type=str2bool, + nargs="?", + const=True, + default=True, + help="will find the latest checkpoint in the logdir" + "if checkpoint_auto_resume is False, wandb_auto_resume will be ignored", + ) + parser.add_argument( + "--no_base_name", + type=str2bool, + nargs="?", + const=True, + default=False, # TODO: later default to True + help="log to wandb", + ) + if version.parse(torch.__version__) >= version.parse("2.0.0"): + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="single checkpoint file to resume from", + ) + default_args = default_trainer_args() + for key in default_args: + # parameters in the pl.Trainer are passed as --key value + parser.add_argument("--" + key, default=default_args[key]) + return parser + + +def get_checkpoint_name(logdir): + ckpt = os.path.join(logdir, "checkpoints", "last**.ckpt") + ckpt = natsorted(glob.glob(ckpt)) + print0('available 
"last" checkpoints:') + print0(ckpt) + if len(ckpt) > 1: + print0("got most recent checkpoint") + ckpt = sorted(ckpt, key=lambda x: os.path.getmtime(x))[-1] + print0(f"Most recent ckpt is {ckpt}") + with open(os.path.join(logdir, "most_recent_ckpt.txt"), "w") as f: + f.write(ckpt + "\n") + try: + version = int(ckpt.split("/")[-1].split("-v")[-1].split(".")[0]) + except Exception as e: + print0("version confusion but not bad") + print0(e) + version = 1 + # version = last_version + 1 + else: + # in this case, we only have one "last.ckpt" + ckpt = ckpt[0] + version = 1 + melk_ckpt_name = f"last-v{version}.ckpt" + print0(f"Current melk ckpt name: {melk_ckpt_name}") + return ckpt, melk_ckpt_name + + +class SetupCallback(Callback): + def __init__( + self, + resume, + now, + logdir, + ckptdir, + cfgdir, + config, + lightning_config, + debug, + save_ckpt_on_exception=False, + ckpt_name=None, + seed=None, + seed_rank=False, + ): + super().__init__() + self.resume = resume + self.now = now + self.logdir = logdir + self.ckptdir = ckptdir + self.cfgdir = cfgdir + self.config = config + self.lightning_config = lightning_config + self.debug = debug + self.save_ckpt_on_exception = save_ckpt_on_exception + self.ckpt_name = ckpt_name + self.seed = seed + self.seed_rank = seed_rank + + def on_exception(self, trainer: pl.Trainer, pl_module, exception): + if self.save_ckpt_on_exception and (not self.debug) and (trainer.global_rank == 0): + print0(f"[bold red]\[main][SetupCallback][/bold red] Saving checkpoint to {self.ckptdir}") + if self.ckpt_name is None: + ckpt_path = os.path.join(self.ckptdir, "last.ckpt") + else: + ckpt_path = os.path.join(self.ckptdir, self.ckpt_name) + trainer.save_checkpoint(ckpt_path) + + def on_fit_start(self, trainer, pl_module): + if self.seed_rank: + # current_seed = torch.initial_seed() + seed_anything(self.seed + trainer.global_rank) + print(f"[bold red]\[main][SetupCallback][/bold red] Rank {trainer.global_rank}: Reset GLOBAL seed to {self.seed + trainer.global_rank}") + elif hasattr(pl_module, "set_seed") and callable(pl_module.set_seed): + pl_module.set_seed(self.seed) + print0(f"[bold red]\[main][SetupCallback][/bold red] Set pl_module seed to {self.seed} with pl_module.set_seed") + if trainer.global_rank == 0: + # Create logdirs and save configs + print0(f"[bold red]\[main][SetupCallback][/bold red] Creating logdir: {self.logdir}, ckptdir: {self.ckptdir}, cfgdir: {self.cfgdir}") + os.makedirs(self.logdir, exist_ok=True) + os.makedirs(self.ckptdir, exist_ok=True) + os.makedirs(self.cfgdir, exist_ok=True) + + if "callbacks" in self.lightning_config: + if ( + "metrics_over_trainsteps_checkpoint" + in self.lightning_config["callbacks"] + ): + os.makedirs( + os.path.join(self.ckptdir, "trainstep_checkpoints"), + exist_ok=True, + ) + print0("[bold red]\[main][SetupCallback][/bold red] Project config") + print0(OmegaConf.to_yaml(self.config)) + if MULTINODE_HACKS and not self.debug: + import time + time.sleep(5) + OmegaConf.save( + self.config, + os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)), + ) + + print0("[bold red]\[main][SetupCallback][/bold red] Lightning config") + print0(OmegaConf.to_yaml(self.lightning_config)) + OmegaConf.save( + OmegaConf.create({"lightning": self.lightning_config}), + os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)), + ) + + else: + # ModelCheckpoint callback created log directory --- remove it + if not MULTINODE_HACKS and not self.resume and os.path.exists(self.logdir): + dst, name = os.path.split(self.logdir) + dst = 
os.path.join(dst, "child_runs", name) + os.makedirs(os.path.split(dst)[0], exist_ok=True) + try: + os.rename(self.logdir, dst) + except FileNotFoundError: + pass + + +class ImageLogger(Callback): + def __init__( + self, + batch_frequency, + max_samples, + clamp=True, + increase_log_steps=True, + rescale=True, + disabled=True, + log_on_batch_idx=False, + log_first_step=False, + log_images_kwargs=None, + log_before_first_step=False, + enable_autocast=True, + ): + super().__init__() + self.enable_autocast = enable_autocast + self.rescale = rescale + self.batch_freq = batch_frequency + self.max_samples = max_samples + self.log_steps = [2**n for n in range(int(np.log2(self.batch_freq)) + 1)] + if not increase_log_steps: + self.log_steps = [self.batch_freq] + self.clamp = clamp + self.disabled = disabled + self.log_on_batch_idx = log_on_batch_idx + self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {} + self.log_first_step = log_first_step + self.log_before_first_step = log_before_first_step + + @rank_zero_only + def log_local( + self, + save_dir, + split, + images, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "images", split) + for k in images: + if isheatmap(images[k]): + fig, ax = plt.subplots() + ax = ax.matshow( + images[k].cpu().numpy(), cmap="hot", interpolation="lanczos" + ) + plt.colorbar(ax) + plt.axis("off") + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + plt.savefig(path) + plt.close() + # TODO: support wandb + else: + grid = torchvision.utils.make_grid(images[k], nrow=4) + if self.rescale: + grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1) + grid = grid.numpy() + grid = (grid * 255).astype(np.uint8) + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + path = os.path.join(root, filename) + os.makedirs(os.path.split(path)[0], exist_ok=True) + img = Image.fromarray(grid) + img.save(path) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "logger_log_image only supports WandbLogger currently" + pl_module.logger.log_image( + key=f"{split}/{k}", + images=[ + img, + ], + step=pl_module.global_step, + ) + + @rank_zero_only + def log_img(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + self.check_frequency(check_idx) + and hasattr(pl_module, "log_images") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_images) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + gpu_autocast_kwargs = { + "enabled": self.enable_autocast, # torch.is_autocast_enabled(), + "dtype": torch.get_autocast_gpu_dtype(), + "cache_enabled": torch.is_autocast_cache_enabled(), + } + with torch.no_grad(), torch.cuda.amp.autocast(**gpu_autocast_kwargs): + images = pl_module.log_images( + batch, split=split, **self.log_images_kwargs + ) + + for k in images: + N = min(images[k].shape[0], self.max_samples) + if not isheatmap(images[k]): + images[k] = images[k][:N] + if isinstance(images[k], torch.Tensor): + images[k] = images[k].detach().float().cpu() + if self.clamp and not isheatmap(images[k]): + images[k] = torch.clamp(images[k], -1.0, 1.0) + + self.log_local( + 
pl_module.logger.save_dir, + split, + images, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + def check_frequency(self, check_idx): + if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and ( + check_idx > 0 or self.log_first_step + ): + try: + self.log_steps.pop(0) + except IndexError as e: + print0("[bold red]\[main][ImageLogger][/bold red]", e) + pass + return True + return False + + @rank_zero_only + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): + if not self.disabled and (pl_module.global_step > 0 or self.log_first_step): + self.log_img(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): + if self.log_before_first_step and pl_module.global_step == 0: + print0(f"[bold red]\[main][ImageLogger][/bold red] {self.__class__.__name__}: logging before training") + self.log_img(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_validation_batch_end( + self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs + ): + if not self.disabled and pl_module.global_step > 0: + self.log_img(pl_module, batch, batch_idx, split="val") + if hasattr(pl_module, "calibrate_grad_norm"): + if ( + pl_module.calibrate_grad_norm and batch_idx % 25 == 0 + ) and batch_idx > 0: + self.log_gradients(trainer, pl_module, batch_idx=batch_idx) + + +@rank_zero_only +def init_wandb(save_dir, opt, config, group_name, name_str): + print0(f"[bold red]\[main][init_wandb][/bold red] Creating WANDB_DIR: {save_dir}") + os.makedirs(save_dir, exist_ok=True) + + # os.environ["WANDB_DIR"] = save_dir + gitcmd = f'git config --global --add safe.directory {os.path.dirname(os.path.abspath(__file__))}' + os.system(gitcmd) + print0(f"[bold red]\[main][init_wandb][/bold red] wandb_id is set to {opt.wandb_id}") + wandb_id = opt.wandb_id if opt.wandb_id is not None else name_str + + if not wandb.api.api_key: + wandb.login(key=opt.wandb_key) + if opt.debug: + wandb.init(project=opt.wandb_project, mode="offline", group=group_name) + else: + wandb.init( + project=opt.wandb_project, + entity=opt.wandb_entity, + config=dict(config), + group=group_name, + name=name_str, + resume='auto', + id=wandb_id, + ) + + +if __name__ == "__main__": + # custom parser to specify config files, train, test and debug mode, + # postfix, resume. + # `--key value` arguments are interpreted as arguments to the trainer. + # `nested.key=value` arguments are interpreted as config parameters. + # configs are merged from left-to-right followed by command line parameters. 
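+    # For example (paths and values here are placeholders, not a prescribed command):
+    # a base config added in this patch is passed via -b, a trainer argument via
+    # `--key value`, and a nested config override via `key=value`:
+    #   python main.py -b configs/vidtok_v1_1/vidtok_kl_causal_488_4chn_v1_1.yaml \
+    #       --logdir logs --devices 2 data.params.batch_size=2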
+ + # model: + # base_learning_rate: float + # target: path to lightning module + # params: + # key: value + # data: + # target: main.DataModuleFromConfig + # params: + # batch_size: int + # wrap: bool + # train: + # target: path to train dataset + # params: + # key: value + # validation: + # target: path to validation dataset + # params: + # key: value + # test: + # target: path to test dataset + # params: + # key: value + # lightning: (optional, has sane defaults and can be specified on cmdline) + # trainer: + # additional arguments to trainer + # logger: + # logger to instantiate + # modelcheckpoint: + # modelcheckpoint to instantiate + # callbacks: + # callback1: + # target: importpath + # params: + # key: value + + now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + + # add cwd for convenience and to make classes in this file available when + # running as `python main.py` + # (in particular `main.DataModuleFromConfig`) + sys.path.append(os.getcwd()) + + parser = get_parser() + + opt, unknown = parser.parse_known_args() + + if opt.name and opt.resume: + raise ValueError( + "-n/--name and -r/--resume cannot be specified both." + "If you want to resume training in a new log folder, " + "use -n/--name in combination with --resume_from_checkpoint" + ) + melk_ckpt_name = None + name = None + if opt.resume: + if not os.path.exists(opt.resume): + raise ValueError("Cannot find {}".format(opt.resume)) + if os.path.isfile(opt.resume): + paths = opt.resume.split("/") + # idx = len(paths)-paths[::-1].index("logs")+1 + # logdir = "/".join(paths[:idx]) + logdir = "/".join(paths[:-2]) + ckpt = opt.resume + _, melk_ckpt_name = get_checkpoint_name(logdir) + else: + assert os.path.isdir(opt.resume), opt.resume + logdir = opt.resume.rstrip("/") + ckpt, melk_ckpt_name = get_checkpoint_name(logdir) + + print0("-" * 80) + print0(f'[bold red][main][/bold red] Resuming from checkpoint "{ckpt}"') + + opt.resume_from_checkpoint = ckpt + base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml"))) + opt.base = base_configs + opt.base + _tmp = logdir.split("/") + nowname = _tmp[-1] + else: + if opt.name: + name = "_" + opt.name + elif opt.base: + if opt.no_base_name: + name = "" + else: + if opt.legacy_naming: + cfg_fname = os.path.split(opt.base[0])[-1] + cfg_name = os.path.splitext(cfg_fname)[0] + else: + assert "configs" in os.path.split(opt.base[0])[0], os.path.split( + opt.base[0] + )[0] + cfg_path = os.path.split(opt.base[0])[0].split(os.sep)[ + os.path.split(opt.base[0])[0].split(os.sep).index("configs") + + 1 : + ] # cut away the first one (we assert all configs are in "configs") + cfg_name = os.path.splitext(os.path.split(opt.base[0])[-1])[0] + cfg_name = "-".join(cfg_path) + f"-{cfg_name}" + name = "_" + cfg_name + else: + name = "" + # automatic resume last checkpoint if available + if os.path.exists(opt.logdir): + auto_resumed = False + for sub_dir in sorted(os.listdir(opt.logdir)): + if sub_dir.endswith(name + opt.postfix): + ## checkpoint resume + if opt.checkpoint_auto_resume and not opt.debug: + checkpoint_dir = os.path.join(opt.logdir, sub_dir, "checkpoints") + # Use the max step checkpoint file + ckpt_files1 = glob.glob(os.path.join(checkpoint_dir, "*/*.ckpt")) + ckpt_files2 = glob.glob(os.path.join(checkpoint_dir, "*.ckpt")) + ckpt_files = ckpt_files1 + ckpt_files2 + ckpt_files.sort(key=get_step_value, reverse=True) + if ckpt_files: + ckpt = ckpt_files[0] + else: + # If no checkpoint files found, use a random initialized model + ckpt = None + if ckpt is not None and 
os.path.isfile(ckpt): + opt.resume_from_checkpoint = ckpt + auto_resumed = True + # print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Find previous log dir and checkpoint: {ckpt}") + ## wandb resume + if opt.wandb_auto_resume: + wandb_dir = Path(os.path.join(opt.logdir, sub_dir)) / "wandb" + if wandb_dir.exists() and any((wandb_dir / "latest-run").iterdir()): + # Parse unique `run_id` from the `.wandb.` file... + wandb_fns = [f.name for f in (wandb_dir / "latest-run").iterdir() if f.name.endswith(".wandb")] + assert len(wandb_fns) == 1, f"There should only be 1 `.wandb.` file... found {len(wandb_fns)}!" + # Regex Match on `run-{id}.wandb` + opt.wandb_id = re.search("run-(.+?).wandb", wandb_fns[0]).group(1) + # print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Find previous wandb run id: {opt.wandb_id}") + if auto_resumed: + print0(f"[bold red]\[main][/bold red] Auto-resuming from checkpoint: {opt.resume_from_checkpoint} and wandb id: {opt.wandb_id}") + ckpt_basename = os.path.basename(opt.resume_from_checkpoint) + seed_str = ''.join(re.findall(r'\d+', ckpt_basename)) + if len(seed_str) > 0: + opt.seed = int(seed_str) + print0(f"[bold red]\[main][/bold red] Auto-reseting seed to {opt.seed} from checkpoint name") + + if not opt.no_date: + nowname = now + name + opt.postfix + else: + nowname = name + opt.postfix + if nowname.startswith("_"): + nowname = nowname[1:] + logdir = os.path.join(opt.logdir, nowname) + print0(f"[bold red]\[main][/bold red] LOGDIR: {logdir}") + + ckptdir = os.path.join(logdir, "checkpoints") + cfgdir = os.path.join(logdir, "configs") + if not opt.seed_rank: + seed_everything(opt.seed, workers=True) # torch.initial_seed() + + # move before model init, in case a torch.compile(...) is called somewhere + if opt.enable_tf32: + # pt_version = version.parse(torch.__version__) + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + print0(f"[bold red]\[main][/bold red] Enabling TF32 for PyTorch {torch.__version__}") + else: + print0(f"[bold red]\[main][/bold red] Using default TF32 settings for PyTorch {torch.__version__}:") + print0(f"[bold red]\[main][/bold red] torch.backends.cuda.matmul.allow_tf32={torch.backends.cuda.matmul.allow_tf32}") + print0(f"[bold red]\[main][/bold red] torch.backends.cudnn.allow_tf32={torch.backends.cudnn.allow_tf32}") + + try: + # init and save configs + configs = [OmegaConf.load(cfg) for cfg in opt.base] + # deal with the unknown args, e.g., --model.base_learning_rate=1.0e-4 + for i, u in enumerate(unknown): + if u.startswith("--"): + unknown[i] = u[2:] + # merge all configs and cli args + cli = OmegaConf.from_dotlist(unknown) + config = OmegaConf.merge(*configs, cli) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Merged input config: {config}") + lightning_config = config.pop("lightning", OmegaConf.create()) + # merge trainer cli with config + trainer_config = lightning_config.get("trainer", OmegaConf.create()) + + # debug: default to one node + if opt.debug: + trainer_config["num_nodes"] = 1 + + # default profiler + trainer_config["profiler"] = None if not opt.debug else "simple" + + # default to gpu + trainer_config["accelerator"] = "gpu" + # + standard_args = default_trainer_args() + for k in standard_args: + if getattr(opt, k) != standard_args[k]: + trainer_config[k] = getattr(opt, k) + + if not "devices" in trainer_config and trainer_config["accelerator"] != "gpu": + del trainer_config["accelerator"] + cpu = True + else: + gpuinfo = trainer_config["devices"] + print0(f"[bold 
red]\[main][/bold red] Running on {gpuinfo} GPUs") + cpu = False + trainer_opt = argparse.Namespace(**trainer_config) + lightning_config.trainer = trainer_config + + # model + model = instantiate_from_config(config.model) + + # trainer and callbacks + trainer_kwargs = dict() + + # default logger configs + default_logger_cfgs = { + "wandb": { + "target": "lightning.pytorch.loggers.WandbLogger", + "params": { + "name": nowname, + "save_dir": logdir, + "offline": opt.debug, + "id": nowname, + "project": opt.wandb_project, + "log_model": False, + "entity": opt.wandb_entity, + }, + }, + "csv": { + "target": "lightning.pytorch.loggers.CSVLogger", + "params": { + "name": "testtube", # hack for sbord fanatics + "save_dir": logdir, + }, + }, + "tensorboard": { + "target": "lightning.pytorch.loggers.TensorBoardLogger", + "params": { + "save_dir": logdir, + "name": 'tensorboard', + "version": nowname, + } + }, + } + default_logger_cfg = default_logger_cfgs["wandb" if opt.wandb else "tensorboard"] + if opt.wandb: + # change once leaving "swiffer" config directory + try: + group_name = nowname.split(now)[-1].split("-")[1] + except: + group_name = nowname + default_logger_cfg["params"]["group"] = group_name + + wandb_save_dir = os.path.join(os.getcwd(), logdir) + os.environ["WANDB_DIR"] = wandb_save_dir + + init_wandb( + wandb_save_dir, + opt=opt, + group_name=group_name, + config=config, + name_str=nowname, + ) + if "logger" in lightning_config: + logger_cfg = lightning_config.logger + else: + logger_cfg = OmegaConf.create() + logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg) + trainer_kwargs["logger"] = instantiate_from_config(logger_cfg) + + ckpt_resume_path = opt.resume_from_checkpoint + + # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to + # specify which metric is used to determine best models + default_modelckpt_cfg = { + "target": "lightning.pytorch.callbacks.ModelCheckpoint", + "params": { + "dirpath": ckptdir, + "filename": "{epoch:04}-{step:08}", # "epoch={epoch:06}-step={step:07}" + "verbose": True, + "save_last": True, + "auto_insert_metric_name": True, + }, + } + if hasattr(model, "monitor"): + print0(f"[bold red]\[main][/bold red] Monitoring {model.monitor} as checkpoint metric.") + default_modelckpt_cfg["params"]["monitor"] = model.monitor + default_modelckpt_cfg["params"]["save_top_k"] = 3 + + if "modelcheckpoint" in lightning_config: + modelckpt_cfg = lightning_config.modelcheckpoint + else: + modelckpt_cfg = OmegaConf.create() + modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] Merged modelckpt-cfg: {modelckpt_cfg}") + + # https://pytorch-lightning.readthedocs.io/en/stable/extensions/strategy.html + # default to ddp if not further specified + default_strategy_config = {"target": "lightning.pytorch.strategies.DDPStrategy"} + + if "strategy" in lightning_config: + strategy_cfg = lightning_config.strategy + else: + strategy_cfg = OmegaConf.create() + default_strategy_config["params"] = { + "find_unused_parameters": False, + # "static_graph": True, + # "ddp_comm_hook": default.fp16_compress_hook # experiment with this, also for DDPSharded + } + strategy_cfg = OmegaConf.merge(default_strategy_config, strategy_cfg) + print0("-" * 80) + print0(f"[bold red]\[main][/bold red] strategy config: {strategy_cfg}") + trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg) + if hasattr(trainer_kwargs["strategy"], "_timeout"): + trainer_kwargs["strategy"]._timeout = 
datetime.timedelta(seconds=5400) # 3600s = 1h + + # add callback which sets up log directory + default_callbacks_cfg = { + "setup_callback": { + "target": "main.SetupCallback", + "params": { + "resume": opt.resume, + "now": now, + "logdir": logdir, + "ckptdir": ckptdir, + "cfgdir": cfgdir, + "config": config, + "lightning_config": lightning_config, + "debug": opt.debug, + "ckpt_name": melk_ckpt_name, + "seed": opt.seed, + "seed_rank": opt.seed_rank + }, + }, + "image_logger": { + "target": "main.ImageLogger", + "params": {"batch_frequency": 1000, "max_samples": 4, "clamp": True}, + }, + "learning_rate_logger": { + "target": "lightning.pytorch.callbacks.LearningRateMonitor", + "params": { + "logging_interval": "step", + # "log_momentum": True + }, + }, + } + if version.parse(pl.__version__) >= version.parse("1.4.0"): + default_callbacks_cfg.update({"checkpoint_callback": modelckpt_cfg}) + + if "callbacks" in lightning_config: + callbacks_cfg = lightning_config.callbacks + else: + callbacks_cfg = OmegaConf.create() + + if "metrics_over_trainsteps_checkpoint" in callbacks_cfg: + print0( + "[bold red]\[main][/bold red] Caution: Saving checkpoints every n train steps without deleting. This might require some free space." + ) + default_metrics_over_trainsteps_ckpt_dict = { + "metrics_over_trainsteps_checkpoint": { + "target": "lightning.pytorch.callbacks.ModelCheckpoint", + "params": { + "dirpath": os.path.join(ckptdir, "trainstep_checkpoints"), + "filename": "{epoch:04}-{step:08}", # "{epoch:06}-{step:09}" + "verbose": True, + "save_top_k": -1, + "every_n_train_steps": 10000, + "save_weights_only": True, + }, + } + } + default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict) + + callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg) + if "ignore_keys_callback" in callbacks_cfg and ckpt_resume_path is not None: + callbacks_cfg.ignore_keys_callback.params["ckpt_path"] = ckpt_resume_path + elif "ignore_keys_callback" in callbacks_cfg: + del callbacks_cfg["ignore_keys_callback"] + + trainer_kwargs["callbacks"] = [ + instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg + ] + if not "plugins" in trainer_kwargs: + trainer_kwargs["plugins"] = list() + + # cmd line trainer args (which are in trainer_opt) have always priority over config-trainer-args (which are in trainer_kwargs) + trainer_opt = vars(trainer_opt) + trainer_kwargs = { + key: val for key, val in trainer_kwargs.items() if key not in trainer_opt + } + trainer = Trainer(**trainer_opt, **trainer_kwargs) + + trainer.logdir = logdir + + # data + if ((not opt.train) or opt.debug) and hasattr(config.data.params, "validation"): + config.data.params.train = config.data.params.validation + print0("[bold red]\[main][/bold red] Using validation data as training data for fast loading.") + data = instantiate_from_config(config.data) + # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + # calling these ourselves should not be necessary but it is. 
+ # lightning still takes care of proper multiprocessing though + data.prepare_data() + # data.setup() + try: + for k in data.datasets: + print0( + f"[bold red]\[main][/bold red] {k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}" + ) + except: + print0("[bold red]\[main][/bold red] datasets not yet initialized.") + + # configure learning rate + if "batch_size" in config.data.params: + bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate + else: + bs, base_lr = ( + config.data.params.train.loader.batch_size, + config.model.base_learning_rate, + ) + if not cpu: + # add for different device input type + if isinstance(lightning_config.trainer.devices, int): + ngpu = lightning_config.trainer.devices + elif isinstance(lightning_config.trainer.devices, list): + ngpu = len(lightning_config.trainer.devices) + elif isinstance(lightning_config.trainer.devices, str): + ngpu = len(lightning_config.trainer.devices.strip(",").split(",")) + else: + ngpu = 1 + if "accumulate_grad_batches" in lightning_config.trainer: + accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches + else: + accumulate_grad_batches = 1 + print0(f"[bold red]\[main][/bold red] accumulate_grad_batches = {accumulate_grad_batches}") + lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches + + if opt.scale_lr: + model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr + print0( + "[bold red]\[main][/bold red] Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)".format( + model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr + ) + ) + else: + model.learning_rate = base_lr + print0("[bold red]\[main][/bold red] NOT using learning rate scaling") + print0(f"[bold red]\[main][/bold red] Setting learning rate to {model.learning_rate:.2e}") + + # allow checkpointing via USR1 + def melk(*args, **kwargs): + # run all checkpoint hooks + if trainer.global_rank == 0: + melkdir = os.path.join(logdir, "melk") + os.makedirs(melkdir, exist_ok=True) + print0(f"[bold red]\[main][/bold red] Saving checkpoint to {melkdir}") + if melk_ckpt_name is None: + ckpt_path = os.path.join(melkdir, "last.ckpt") + else: + ckpt_path = os.path.join(melkdir, melk_ckpt_name) + trainer.save_checkpoint(ckpt_path) + + def divein(*args, **kwargs): + if trainer.global_rank == 0: + import pudb + pudb.set_trace() + + import signal + signal.signal(signal.SIGUSR1, melk) + signal.signal(signal.SIGUSR2, divein) + + # run + if opt.train: + try: + trainer.fit(model, data, ckpt_path=ckpt_resume_path) + print0(f"[bold red]\[main][/bold red] Finish training with logdir: {logdir}") + except Exception as e: + print(f"") + print(f"[bold red]\[main][/bold red] Exception: {e}") + print(f"[bold red]\[main][/bold red] Beijing Time {datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai'))}") + if not opt.debug: + melk() + raise + else: + trainer.validate(model, data, ckpt_path=ckpt_resume_path) + exit() + if not opt.no_test and not trainer.interrupted: + trainer.test(model, data) + except RuntimeError as err: + if MULTINODE_HACKS: + import datetime + import os + import socket + import requests + + device = os.environ.get("CUDA_VISIBLE_DEVICES", "?") + hostname = socket.gethostname() + ts = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + resp = requests.get("http://169.254.169.254/latest/meta-data/instance-id") + print( + f"[bold red]\[main][/bold red] ERROR at {ts} on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): 
{type(err).__name__}: {err}", + flush=True, + ) + raise err + except Exception: + if opt.debug and trainer.global_rank == 0: + try: + import pudb as debugger + except ImportError: + import pdb as debugger + # debugger.post_mortem() + raise + finally: + # move newly created debug project to debug_runs + if opt.debug and not opt.resume and trainer.global_rank == 0: + dst, name = os.path.split(logdir) + dst = os.path.join(dst, "debug_runs", name) + os.makedirs(os.path.split(dst)[0], exist_ok=True) + os.rename(logdir, dst) + + if opt.wandb: + wandb.finish() + + # clean up + # dist.barrier() + # torch.cuda.empty_cache() + dist.destroy_process_group() + + if trainer.global_rank == 0 and opt.debug: + print0(f"[bold red]\[main][/bold red] Current logdir: {logdir}") + # print0(f"[bold red]\[main][/bold red] Profiler summary:") + # print(trainer.profiler.summary()) + print0(f"[bold red]\[main][/bold red] Memory summary:") + num_params = sum([p.numel() for p in model.parameters()]) + print0(f"[bold red]\[main][/bold red] Expected bf16 memory usage from params: {num_params * 2 / 1e9:.2f} GB") + print0(f"[bold red]\[main][/bold red] Current memory usage with model on device {torch.cuda.max_memory_allocated() / 1e9:.2f} GB") + # trainer.print(torch.cuda.memory_summary()) diff --git a/Meissonic/vidtok_cache/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc b/Meissonic/vidtok_cache/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3073fc0399093458876a613ed5102383d038820 Binary files /dev/null and b/Meissonic/vidtok_cache/VidTok/scripts/__pycache__/inference_evaluate.cpython-310.pyc differ diff --git a/Meissonic/vidtok_cache/VidTok/scripts/inference_evaluate.py b/Meissonic/vidtok_cache/VidTok/scripts/inference_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..d4152046727305af7b32e0a3e5728072b7085ca4 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/scripts/inference_evaluate.py @@ -0,0 +1,198 @@ +import argparse +import os +import sys +sys.path.append(os.getcwd()) + +import warnings +warnings.filterwarnings("ignore") + +import time +from contextlib import nullcontext +from omegaconf import OmegaConf +from torch import autocast +from tqdm import tqdm + +import numpy as np +import torch +from einops import rearrange +from lightning.pytorch import seed_everything + +from vidtok.data.vidtok import VidTokValDataset +from vidtok.modules.lpips import LPIPS +from vidtok.modules.util import (compute_psnr, compute_ssim, + instantiate_from_config, print0) + + +def load_model_from_config(config, ckpt, ignore_keys=[], verbose=False): + config = OmegaConf.load(config) + config.model.params.ckpt_path = ckpt + config.model.params.ignore_keys = ignore_keys + config.model.params.verbose = verbose + model = instantiate_from_config(config.model) + return model + + +class MultiVideoDataset(VidTokValDataset): + def __init__( + self, + data_dir, + meta_path=None, + input_height=256, + input_width=256, + sample_fps=30, + chunk_size=16, + is_causal=True, + read_long_video=False + ): + super().__init__( + data_dir=data_dir, + meta_path=meta_path, + video_params={ + "input_height": input_height, + "input_width": input_width, + "sample_num_frames": chunk_size + 1 if is_causal else chunk_size, + "sample_fps": sample_fps, + }, + pre_load_frames=True, + last_frames_handle="repeat", + read_long_video=read_long_video, + chunk_size=chunk_size, + is_causal=is_causal, + ) + + def __getitem__(self, idx): + frames = 
super().__getitem__(idx)["jpg"] + return frames + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--data_dir", + type=str, + default="./", + help="root folder", + ) + parser.add_argument( + "--meta_path", + type=str, + default=None, + help="path to the .csv meta file", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=16, + help="the size of a chunk - we split a long video into several chunks", + ) + parser.add_argument( + "--read_long_video", + action='store_true' + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[scripts.inference_evaluate][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + assert args.chunk_size % model.encoder.time_downsample_factor == 0 + + + if args.read_long_video: + assert hasattr(model, 'use_tiling'), "Tiling inference is needed to conduct long video reconstruction." 
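+ # Tiled inference for long videos: the clip is encoded/decoded in temporal chunks of chunk_size frames so the whole video never has to fit in GPU memory at once; + # the decoder chunk length below is the encoder chunk length divided by the temporal downsample factor, keeping encoder and decoder chunks aligned.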
+ print(f"Using tiling inference to save memory usage...") + model.enable_tiling() + model.t_chunk_enc = args.chunk_size + model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor + + if args.input_width > 256: + model.enable_tiling() + + dataset = MultiVideoDataset( + data_dir=args.data_dir, + meta_path=args.meta_path, + input_height=args.input_height, + input_width=args.input_width, + sample_fps=args.sample_fps, + chunk_size=args.chunk_size, + is_causal=model.is_causal, + read_long_video=args.read_long_video + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + perceptual_loss = LPIPS().eval() + perceptual_loss = perceptual_loss.to(device) + + psnrs, ssims, lpipss = [], [], [] + + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, output, reg_log = model(input) + output = output.clamp(-1, 1) + input, output = map(lambda x: (x + 1) / 2, (input, output)) + + if input.dim() == 5: + input = rearrange(input, "b c t h w -> (b t) c h w") + assert output.dim() == 5 + output = rearrange(output, "b c t h w -> (b t) c h w") + + for inp, out in zip(torch.split(input, 16), torch.split(output, 16)): + psnrs += [compute_psnr(inp, out).item()] * inp.shape[0] + ssims += [compute_ssim(inp, out).item()] * inp.shape[0] + lpipss += [perceptual_loss(inp * 2 - 1, out * 2 - 1).mean().item()] * inp.shape[0] + + toc = time.time() + print0( + f"[bold red]\[scripts.inference_evaluate][/bold red] PSNR: {np.mean(psnrs):.4f}, SSIM: {np.mean(ssims):.4f}, LPIPS: {np.mean(lpipss):.4f}" + ) + print0(f"[bold red]\[scripts.inference_evaluate][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/vidtok_cache/VidTok/scripts/inference_reconstruct.py b/Meissonic/vidtok_cache/VidTok/scripts/inference_reconstruct.py new file mode 100644 index 0000000000000000000000000000000000000000..3a26b9475339f6675e01fd052637c8465ca37caf --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/scripts/inference_reconstruct.py @@ -0,0 +1,246 @@ +import os +import sys +sys.path.append(os.getcwd()) + +import argparse +import warnings +warnings.filterwarnings("ignore") + +import time +from contextlib import nullcontext +from omegaconf import OmegaConf +from pathlib import Path +from tqdm import tqdm + +import numpy as np +import torch +import decord +from einops import rearrange +from lightning.pytorch import seed_everything +from torch import autocast +from torchvision import transforms +from torchvision.io import write_video + +from vidtok.modules.util import print0 +from scripts.inference_evaluate import load_model_from_config + + +class SingleVideoDataset(torch.utils.data.Dataset): + def __init__( + self, + video_path, + input_height=128, + input_width=128, + sample_fps=8, + chunk_size=16, + is_causal=True, + read_long_video=False + ): + decord.bridge.set_bridge("torch") + self.video_path = video_path + self.transform = transforms.Compose( + [ + transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + ] + ) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + if read_long_video: + video_length = len(frame_ids) + if 
is_causal and video_length > chunk_size: + self.frame_ids_batch.append(frame_ids[:chunk_size * ((video_length - 1) // chunk_size) + 1]) + elif not is_causal and video_length >= chunk_size: + self.frame_ids_batch.append(frame_ids[:chunk_size * (video_length // chunk_size)]) + else: + num_frames_per_batch = chunk_size + 1 if is_causal else chunk_size + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x : x + num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x : x + num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255.0 + frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=16, + help="the size of a chunk - we split a long video into several chunks", + ) + parser.add_argument( + "--read_long_video", + action='store_true' + ) + parser.add_argument( + "--pad_gen_frames", + action="store_true", + help="Used only in causal mode. 
If True, pad frames generated in the last batch, else replicate the first frame instead", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[scripts.inference_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + config = OmegaConf.load(args.config) + + os.makedirs(args.output_video_dir, exist_ok=True) + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + assert args.chunk_size % model.encoder.time_downsample_factor == 0 + + if args.read_long_video: + assert hasattr(model, 'use_tiling'), "Tiling inference is needed to conduct long video reconstruction." + print(f"Using tiling inference to save memory usage...") + model.use_tiling = True + model.t_chunk_enc = args.chunk_size + model.t_chunk_dec = model.t_chunk_enc // model.encoder.time_downsample_factor + model.use_overlap = True + + dataset = SingleVideoDataset( + video_path=args.input_video_path, + input_height=args.input_height, + input_width=args.input_width, + sample_fps=args.sample_fps, + chunk_size=args.chunk_size, + is_causal=model.is_causal, + read_long_video=args.read_long_video + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + inputs = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + + if model.is_causal and not args.read_long_video and args.pad_gen_frames: + if i == 0: + _, xrec, _ = model(input) + else: + _, xrec, _ = model(torch.cat([last_gen_frames, input], dim=2)) + xrec = xrec[:, :, -input.shape[2]:].clamp(-1, 1) + last_gen_frames = xrec[:, :, (1 - model.encoder.time_downsample_factor):, :, :] + else: + _, xrec, _ = model(input) + + input = rearrange(input, "b c t h w -> (b t) c h w") + inputs.append(input) + xrec = rearrange(xrec.clamp(-1, 1), "b c t h w -> (b t) c h w") + outputs.append(xrec) + + toc = time.time() + + # save the outputs as videos + inputs = tensor_to_uint8(torch.cat(inputs, dim=0)) + inputs = rearrange(inputs, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs.shape[0], outputs.shape[0]) + final = np.concatenate([inputs[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"{Path(args.input_video_path).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + + print0(f"[bold red]Results saved in: {output_video_path}[/bold red]") + print0(f"[bold red]\[scripts.inference_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc b/Meissonic/vidtok_cache/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a60105308bbef7ac0e229301aaf664728e0dba28 Binary files /dev/null and b/Meissonic/vidtok_cache/VidTok/vidtok/data/__pycache__/vidtok.cpython-310.pyc differ diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/data/datamodule.py b/Meissonic/vidtok_cache/VidTok/vidtok/data/datamodule.py new file mode 
100644 index 0000000000000000000000000000000000000000..c405b84703735b9a076b9ffb4e74de673470c15b --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/data/datamodule.py @@ -0,0 +1,150 @@ +import numpy as np +from functools import partial + +import torch +import lightning.pytorch as pl +from torch.utils.data import DataLoader, Dataset, IterableDataset + +from vidtok.modules.util import instantiate_from_config + + +class WrappedDataset(Dataset): + """Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset""" + + def __init__(self, dataset): + self.data = dataset + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + +def worker_init_fn(_): + worker_info = torch.utils.data.get_worker_info() + + dataset = worker_info.dataset + worker_id = worker_info.id + + if isinstance(dataset, IterableDataset): + split_size = dataset.num_records // worker_info.num_workers + # reset num_records to the true number to retain reliable length information + dataset.sample_ids = dataset.valid_ids[ + worker_id * split_size : (worker_id + 1) * split_size + ] + current_id = np.random.choice(len(np.random.get_state()[1]), 1) + return np.random.seed(np.random.get_state()[1][current_id] + worker_id) + else: + return np.random.seed(np.random.get_state()[1][0] + worker_id) + + +class DataModuleFromConfig(pl.LightningDataModule): + def __init__( + self, + batch_size, + train=None, + validation=None, + test=None, + predict=None, + wrap=False, + num_workers=None, + pin_train_memory=True, + is_iterable_dataset=False, + shuffle_test_loader=False, + use_worker_init_fn=False, + shuffle_val_dataloader=False, + ): + super().__init__() + self.batch_size = batch_size + self.dataset_configs = dict() + self.num_workers = num_workers if num_workers is not None else batch_size * 2 + self.pin_train_memory = pin_train_memory + self.is_iterable_dataset = is_iterable_dataset + self.use_worker_init_fn = use_worker_init_fn + if train is not None: + self.dataset_configs["train"] = train + self.train_dataloader = self._train_dataloader + if validation is not None: + self.dataset_configs["validation"] = validation + self.val_dataloader = partial( + self._val_dataloader, shuffle=shuffle_val_dataloader + ) + if test is not None: + self.dataset_configs["test"] = test + self.test_dataloader = partial( + self._test_dataloader, shuffle=shuffle_test_loader + ) + if predict is not None: + self.dataset_configs["predict"] = predict + self.predict_dataloader = self._predict_dataloader + self.wrap = wrap + + def prepare_data(self): + for data_cfg in self.dataset_configs.values(): + instantiate_from_config(data_cfg) + + def setup(self, stage=None): + self.datasets = dict( + (k, instantiate_from_config(self.dataset_configs[k])) + for k in self.dataset_configs + ) + if self.wrap: + for k in self.datasets: + self.datasets[k] = WrappedDataset(self.datasets[k]) + + def _train_dataloader(self): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + self.datasets["train"], + batch_size=self.batch_size, + num_workers=self.num_workers, + pin_memory=self.pin_train_memory, + shuffle=False if self.is_iterable_dataset else True, + worker_init_fn=init_fn, + ) + + def _val_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + self.datasets["validation"], + batch_size=self.batch_size, + 
num_workers=self.num_workers, + worker_init_fn=init_fn, + shuffle=shuffle, + ) + + def _test_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + + # do not shuffle dataloader for iterable dataset + shuffle = shuffle and (not self.is_iterable_dataset) + + return DataLoader( + self.datasets["test"], + batch_size=self.batch_size, + num_workers=self.num_workers, + worker_init_fn=init_fn, + shuffle=shuffle, + ) + + def _predict_dataloader(self, shuffle=False): + if self.is_iterable_dataset or self.use_worker_init_fn: + init_fn = worker_init_fn + else: + init_fn = None + return DataLoader( + self.datasets["predict"], + batch_size=self.batch_size, + num_workers=self.num_workers, + worker_init_fn=init_fn, + ) diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/data/video_read.py b/Meissonic/vidtok_cache/VidTok/vidtok/data/video_read.py new file mode 100644 index 0000000000000000000000000000000000000000..357cd48305141ee70924fd6a79adbe5cb7f6ca5b --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/data/video_read.py @@ -0,0 +1,88 @@ +import os +import random +import decord +import numpy as np +import torch + +from vidtok.modules.util import print0 + +decord.bridge.set_bridge("torch") + + +def sample_frames_with_fps( + total_frames, + video_fps, + sample_num_frames, + sample_fps, + start_index=None +): + """sample frames proportional to the length of the frames in one second + e.g., 1s video has 30 frames, when 'fps'=3, we sample frames with spacing of 30/3=10 + return the frame indices + + Parameters + ---------- + total_frames : length of the video + video_fps : original fps of the video + sample_num_frames : number of frames to sample + sample_fps : the fps to sample frames + start_index : the starting frame index. If it is not None, it will be used as the starting frame index + + Returns + ------- + frame indices + """ + sample_num_frames = min(sample_num_frames, total_frames) + interval = round(video_fps / sample_fps) + frames_range = (sample_num_frames - 1) * interval + 1 + + if start_index is not None: + start = start_index + elif total_frames - frames_range - 1 < 0: + start = 0 + else: + start = random.randint(0, total_frames - frames_range - 1) + + frame_idxs = np.linspace( + start=start, stop=min(total_frames - 1, start + frames_range), num=sample_num_frames + ).astype(int) + + return frame_idxs + + +def read_frames_with_decord( + video_path, + sample_num_frames, + sample_fps, + start_index=None +) -> tuple[torch.Tensor, list[int]]: + """read frames from video path using decord + + Parameters + ---------- + video_path : path to video + sample_num_frames : number of frames to sample + sample_fps : the fps to sample frames + start_index : the starting frame index. If it is not None, it will be used as the starting frame index + + Returns + ------- + frames (tensor 0~1), frame indices + """ + video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(video_reader) + video_fps = video_reader.get_avg_fps() # note that the fps here is float. 
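+ # Frame indices are spaced by round(video_fps / sample_fps), so the sampled frames correspond to playback at roughly sample_fps.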
+ frame_idxs = sample_frames_with_fps( + total_frames=total_frames, + video_fps=video_fps, + sample_num_frames=sample_num_frames, + sample_fps=sample_fps, + start_index=start_index + ) + frames = video_reader.get_batch(frame_idxs) + frames = frames.float() / 255 + frames = frames.permute(0, 3, 1, 2) + if (frames.shape[0] != sample_num_frames) or (len(frame_idxs) != sample_num_frames): + print0(f"[bold yellow]\[vidtok.data.video_read][read_frames_with_decord][/bold yellow] Warning: need {sample_num_frames} frames, " + f"but got {frames.shape[0]} frames, {len(frame_idxs)} frame indices, video_path={video_path}.") + return frames, frame_idxs diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/data/vidtok.py b/Meissonic/vidtok_cache/VidTok/vidtok/data/vidtok.py new file mode 100644 index 0000000000000000000000000000000000000000..4b1898d4718de1450da96f25e9134dd4e1084cc1 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/data/vidtok.py @@ -0,0 +1,333 @@ +import os +import glob +from typing import Union + +import decord +import numpy as np +import pandas as pd +import torch +from PIL import Image +from torch.utils.data import Dataset +from torchvision.transforms import v2 +from tqdm import trange + +from vidtok.modules.util import print0 +from .video_read import read_frames_with_decord + + +class VidTokDataset(Dataset): + def __init__( + self, + data_dir: str, + meta_path: str, + video_params: dict, + data_frac: float = 1.0, + is_strict_loading: bool = False, + skip_missing_files: bool = True, + start_index: Union[None, int] = None + ): + super().__init__() + + self.data_dir = data_dir + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Use data dir: {self.data_dir}") + + self.meta_path = meta_path + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Use meta path: {self.meta_path}") + + self.video_params = video_params + + self.data_frac = data_frac + self.is_strict_loading = is_strict_loading + self.skip_missing_files = skip_missing_files + self.start_index = start_index + self.transforms = self._get_transforms( + video_params["input_height"], + video_params["input_width"], + ) + + self.missing_files = [] + self._load_metadata() + + def _get_transforms(self, input_height, input_width, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5]): + normalize = v2.Normalize(mean=norm_mean, std=norm_std) + return v2.Compose( + [ + v2.Resize(input_height, antialias=True), + v2.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + def _load_metadata(self): + metadata = pd.read_csv( + self.meta_path, + on_bad_lines="skip", + encoding="ISO-8859-1", + engine="python", + sep=",",) + + if self.data_frac < 1: + metadata = metadata.sample(frac=self.data_frac) + self.metadata = metadata + self.metadata.dropna(inplace=True) + + def _get_video_path(self, sample): + """reduce the access to the disk + """ + rel_video_fp = str(sample["videos"]) + abs_video_fp = os.path.join(self.data_dir, rel_video_fp) + return abs_video_fp, rel_video_fp + + def __len__(self): + return len(self.metadata) + + def __getitem__(self, item): + item = item % len(self.metadata) + sample = self.metadata.iloc[item] + video_fp, _ = self._get_video_path(sample) + + try: + if os.path.isfile(video_fp): + imgs, idxs = read_frames_with_decord( + video_path=video_fp, + sample_num_frames=self.video_params["sample_num_frames"], + sample_fps=self.video_params["sample_fps"], + start_index=self.start_index + ) + else: + # if the video file is missing + if video_fp not in self.missing_files: + 
self.missing_files.append(video_fp) + # resample another video or not + if self.skip_missing_files: + print0(f"[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Warning: missing video file {video_fp}. Resampling another video.") + return self.__getitem__(np.random.choice(self.__len__())) + else: + raise ValueError(f"Video file {video_fp} is missing, skip_missing_files={self.skip_missing_files}.") + except Exception as e: + # if the video exists, but loading failed + if self.is_strict_loading: + raise ValueError(f"Video loading failed for {video_fp}, is_strict_loading={self.is_strict_loading}.") from e + else: + print0("[bold yellow]\[vidtok.data.vidtok][VidTokDataset][/bold yellow] Warning: using the pure black image as the frame sample") + imgs = Image.new("RGB", (self.video_params["input_width"], self.video_params["input_height"]), (0, 0, 0)) + imgs = v2.ToTensor()(imgs).unsqueeze(0) + + if self.transforms is not None: + # imgs: (T, C, H, W) + imgs = self.transforms(imgs) + + if imgs.shape[0] < self.video_params["sample_num_frames"]: + imgs = torch.cat([imgs, imgs[-1].unsqueeze(0).repeat(self.video_params["sample_num_frames"] - imgs.shape[0], 1, 1, 1)], dim=0) + + imgs = imgs.permute(1, 0, 2, 3) # (C, T, H, W) + + return { + 'jpg': imgs, + "path": video_fp + } + + +class VidTokValDataset(Dataset): + def __init__( + self, + data_dir: str, + video_params: dict, + meta_path: Union[None, str] = None, + pre_load_frames: bool = True, + is_strict_loading: bool = True, + last_frames_handle: str = "repeat", # 'repeat', 'drop' + skip_missing_files: bool = False, + read_long_video: bool = False, + chunk_size: int = 16, + is_causal: bool = True, + ): + super().__init__() + + self.data_dir = data_dir + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Use data dir: {self.data_dir}" + ) + + self.meta_path = meta_path + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Use meta path: {self.meta_path}" + ) + + self.video_params = video_params + self.read_long_video = read_long_video + self.chunk_size = chunk_size + self.is_causal = is_causal + + self.is_strict_loading = is_strict_loading + self.last_frames_handle = last_frames_handle + self.skip_missing_files = skip_missing_files + self.transforms = self._get_transforms( + video_params["input_height"], + video_params["input_width"], + ) + + self.missing_files = [] + self._load_metadata() + self._load_every_frame_from_meta() + + if pre_load_frames: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Pre-loading all frames into CPU..." 
+ ) + self._pre_load_frames() + + def _get_transforms(self, input_height, input_width, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5]): + normalize = v2.Normalize(mean=norm_mean, std=norm_std) + return v2.Compose( + [ + v2.Resize(input_height, antialias=True), + v2.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + def _load_metadata(self): + if self.meta_path is not None: + metadata = pd.read_csv( + self.meta_path, + on_bad_lines="skip", + encoding="ISO-8859-1", + engine="python", + sep=",", + ) + self.metadata = metadata + self.metadata.dropna(inplace=True) + else: + self.metadata = glob.glob(os.path.join(self.data_dir, '**', '*.mp4'), recursive=True) + + def _load_every_frame_from_meta(self): + decord.bridge.set_bridge("torch") + self.frames_batch = [] + for video_idx in range(len(self.metadata)): + try: + sample = self.metadata.iloc[video_idx] + video_fp, _ = self._get_video_path(sample) + except: + video_fp = self.metadata[video_idx] + if os.path.isfile(video_fp): + video_reader = decord.VideoReader(video_fp, num_threads=0) + total_frames = len(video_reader) + fps = video_reader.get_avg_fps() # float + interval = round(fps / self.video_params["sample_fps"]) + frame_ids = list(range(0, total_frames, interval)) + + if self.read_long_video: + video_length = len(frame_ids) + if self.is_causal and video_length > self.chunk_size: + num_frames_ids = frame_ids[:self.chunk_size * ((video_length - 1) // self.chunk_size) + 1] + elif not self.is_causal and video_length >= self.chunk_size: + num_frames_ids = frame_ids[:self.chunk_size * (video_length // self.chunk_size)] + else: + continue + self.frames_batch.append( + { + "video_fp": video_fp, + "num_frames_ids": num_frames_ids, + } + ) + else: + for x in range(0, len(frame_ids), self.video_params["sample_num_frames"]): + num_frames_ids = frame_ids[x : x + self.video_params["sample_num_frames"]] + if len(num_frames_ids) < self.video_params["sample_num_frames"]: + if self.last_frames_handle == "repeat": + num_frames_ids += [num_frames_ids[-1]] * ( + self.video_params["sample_num_frames"] - len(num_frames_ids) + ) + elif self.last_frames_handle == "drop": + continue + else: + raise ValueError(f"Invalid last_frames_handle: {self.last_frames_handle}") + self.frames_batch.append( + { + "video_fp": video_fp, + "num_frames_ids": num_frames_ids, + } + ) + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Loaded all frames index from {len(self.metadata)} videos." 
+ ) + + def _pre_load_frames(self): + last_video_fp = None + for idx in trange(len(self.frames_batch), desc="Pre-loading all frames"): + if self.frames_batch[idx]["video_fp"] != last_video_fp: + video_reader = decord.VideoReader(self.frames_batch[idx]["video_fp"], num_threads=0) + last_video_fp = self.frames_batch[idx]["video_fp"] + self.frames_batch[idx]["frames"] = ( + video_reader.get_batch(self.frames_batch[idx]["num_frames_ids"]).permute(0, 3, 1, 2).float() + / 255.0 + ) + + def _get_video_path(self, sample): + """reduce the access to the disk""" + rel_video_fp = str(sample["videos"]) + abs_video_fp = os.path.join(self.data_dir, rel_video_fp) + return abs_video_fp, rel_video_fp + + def __len__(self): + return len(self.frames_batch) + + def __getitem__(self, item): + video_fp = self.frames_batch[item]["video_fp"] + + try: + if "frames" in self.frames_batch[item]: + imgs = self.frames_batch[item]["frames"] + elif os.path.isfile(video_fp): + video_reader = decord.VideoReader(video_fp, num_threads=0) + imgs = ( + video_reader.get_batch(self.frames_batch[item]["num_frames_ids"]).permute(0, 3, 1, 2).float() + / 255.0 + ) + else: + # if the video file is missing + if video_fp not in self.missing_files: + self.missing_files.append(video_fp) + # resample another video or not + if self.skip_missing_files: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: missing video file {video_fp}. Resampling another video." + ) + return self.__getitem__(np.random.choice(self.__len__())) + else: + raise ValueError(f"Video file {video_fp} is missing, skip_missing_files={self.skip_missing_files}.") + except Exception as e: + # if the video exists, but loading failed + if self.is_strict_loading: + raise ValueError( + f"Video loading failed for {video_fp}, is_strict_loading={self.is_strict_loading}." + ) from e + else: + print0( + "[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: using the pure black image as the frame sample" + ) + imgs = Image.new( + "RGB", (self.video_params["input_width"], self.video_params["input_height"]), (0, 0, 0) + ) + imgs = v2.ToTensor()(imgs).unsqueeze(0) + + if self.transforms is not None: + imgs = self.transforms(imgs) + + if not self.read_long_video: + if imgs.shape[0] < self.video_params["sample_num_frames"]: + print0( + f"[bold yellow]\[vidtok.data.vidtok][VidTokValDataset][/bold yellow] Warning: video {video_fp} has less frames {imgs.shape[0]} than sample_num_frames {self.video_params['sample_num_frames']}." 
+ ) + imgs = torch.cat( + [imgs, imgs[-1].unsqueeze(0).repeat(self.video_params["sample_num_frames"] - imgs.shape[0], 1, 1, 1)], + dim=0, + ) + + imgs = imgs.permute(1, 0, 2, 3) # (C, T, H, W) + + return { + "jpg": imgs, + "path": video_fp, + } diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/models/autoencoder.py b/Meissonic/vidtok_cache/VidTok/vidtok/models/autoencoder.py new file mode 100644 index 0000000000000000000000000000000000000000..96da5e6c74621b4f82538453e850567245adbbb0 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/models/autoencoder.py @@ -0,0 +1,517 @@ +import re +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union, Optional, List +from omegaconf import ListConfig +from packaging import version + +import torch +import lightning.pytorch as pl + +from safetensors.torch import load_file as load_safetensors +from vidtok.modules.ema import LitEma +from vidtok.modules.util import (default, get_obj_from_str, + instantiate_from_config, print0) +from vidtok.modules.regularizers import pack_one, unpack_one, rearrange + + +class AbstractAutoencoder(pl.LightningModule): + """ + This is the base class for all autoencoders + """ + + def __init__( + self, + ema_decay: Union[None, float] = None, + monitor: Union[None, str] = None, + mode: Union[None, str] = None, + input_key: str = "jpg", + ): + super().__init__() + + self.input_key = input_key + self.use_ema = ema_decay is not None + self.ema_decay = ema_decay + if monitor is not None: + self.monitor = monitor + if mode is not None: + self.mode = mode + + if version.parse(torch.__version__) >= version.parse("2.0.0"): + self.automatic_optimization = False + + @abstractmethod + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + raise NotImplementedError() + + @abstractmethod + def get_input(self, batch) -> Any: + raise NotImplementedError() + + def on_train_batch_end(self, *args, **kwargs): + # for EMA computation + if self.use_ema: + self.model_ema(self) + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Switched to EMA weights" + ) + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Restored training weights" + ) + + @abstractmethod + def encode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] encode()-method of abstract base class called" + ) + + @abstractmethod + def decode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] decode()-method of abstract base class called" + ) + + def instantiate_optimizer_from_config(self, params, lr, cfg): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] loading >>> {cfg['target']} <<< optimizer from config" + ) + return get_obj_from_str(cfg["target"])(params, lr=lr, **cfg.get("params", dict())) + + @abstractmethod + def configure_optimizers(self) -> Any: + raise NotImplementedError() + + +class 
AutoencodingEngine(AbstractAutoencoder): + """ + Base class for all video tokenizers that we train + """ + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + verbose = kwargs.pop("verbose", True) + super().__init__(*args, **kwargs) + + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + self.optimizer_config = default(optimizer_config, {"target": "torch.optim.Adam"}) + self.lr_g_factor = lr_g_factor + self.is_causal = self.encoder.is_causal + + self.temporal_compression_ratio = 2 ** len(self.encoder.tempo_ds) + self.use_tiling = False + # Decode more latent frames at once + self.num_sample_frames_batch_size = 16 + self.num_latent_frames_batch_size = self.num_sample_frames_batch_size // self.temporal_compression_ratio + # We make the minimum height and width of sample for tiling half that of the generally supported + self.tile_sample_min_height = 256 + self.tile_sample_min_width = 256 + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = 0 # 1 / 8 + self.tile_overlap_factor_width = 0 # 1 / 8 + + if self.use_ema: + self.model_ema = LitEma(self, decay=self.ema_decay) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Keeping EMAs of {len(list(self.model_ema.buffers()))}." + ) + + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Use ckpt_path: {ckpt_path}" + ) + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, verbose=verbose) + + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + if path.endswith("ckpt"): + ckpt = torch.load(path, map_location="cpu") + weights = ckpt["state_dict"] if "state_dict" in ckpt else ckpt + elif path.endswith("safetensors"): + weights = load_safetensors(path) + else: + raise NotImplementedError(f"Unknown checkpoint: {path}") + + keys = list(weights.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Deleting key {k} from state_dict." 
+ ) + del weights[k] + + missing, unexpected = self.load_state_dict(weights, strict=False) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if verbose: + if len(missing) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Missing Keys: {missing}" + ) + if len(unexpected) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Unexpected Keys: {unexpected}" + ) + + def get_input(self, batch: Dict) -> torch.Tensor: + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(filter(lambda p: p.requires_grad, self.encoder.parameters())) + + list(filter(lambda p: p.requires_grad, self.decoder.parameters())) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def enable_tiling( + self, + tile_sample_min_height: Optional[int] = None, + tile_sample_min_width: Optional[int] = None, + tile_overlap_factor_height: Optional[float] = None, + tile_overlap_factor_width: Optional[float] = None, + ) -> None: + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + self.use_tiling = False + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + if self.use_tiling: + z = self.tile_encode(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + else: + z = self.encoder(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + + if return_reg_log: + return z, reg_log + return z + + def tile_encode(self, x: Any) -> Any: + + num_frames, height, width = x.shape[-3:] + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = 
self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = [[0, num_frames]] + result_z = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile = self.encoder(tile) + result_z.append(tile) + + row.append(torch.cat(result_z, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + enc = torch.cat(result_rows, dim=3) + + return enc + + def indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + token_indices = rearrange(token_indices, "... -> ... 1") + token_indices, ps = pack_one(token_indices, "b * d") + codes = self.regularization.indices_to_codes(token_indices) + codes = rearrange(codes, "b d n c -> b n (c d)") + z = self.regularization.project_out(codes) + z = unpack_one(z, ps, "b * d") + z = rearrange(z, "b ... d -> b d ...") + return z + + def decode(self, z: Any, decode_from_indices: bool = False) -> torch.Tensor: + if decode_from_indices: + z = self.indices_to_latent(z) + if self.use_tiling: + x = self.tile_decode(z) + else: + x = self.decoder(z) + return x + + def tile_decode(self, z: Any) -> torch.Tensor: + + num_frames, height, width = z.shape[-3:] + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. 
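+ # Each latent tile of size tile_latent_min_height x tile_latent_min_width is decoded independently; + # neighbouring tiles are then linearly blended along any overlap region (blend_v / blend_h) and cropped to + # row_limit_height x row_limit_width before being stitched back into the full frame.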
+ rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = [[0, num_frames]] + time = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + tile = z[ + :, + :, + start_frame : end_frame, + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + tile = self.decoder(tile) + if self.is_causal and end_frame + 1 <= num_frames: + tile = tile[:, :, : -self.encoder.time_downsample_factor] + time.append(tile) + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + return dec + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if self.encoder.fix_encoder: + with torch.no_grad(): + z, reg_log = self.encode(x, return_reg_log=True) + else: + z, reg_log = self.encode(x, return_reg_log=True) + + dec = self.decode(z) + return z, dec, reg_log + + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + opt_g, opt_d = self.optimizers() + + # autoencode loss + self.toggle_optimizer(opt_g) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + + # gradient clip + torch.nn.utils.clip_grad_norm_(self.get_autoencoder_params(), 20.0) + opt_g.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + torch.nn.utils.clip_grad_norm_(self.get_discriminator_params(), 20.0) + opt_d.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + lr = opt_g.param_groups[0]["lr"] + self.log( + "lr_abs", + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + ) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + 
regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + log_dict_ae.update(log_dict_disc) + self.log_dict(log_dict_ae) + return log_dict_ae + + def configure_optimizers(self) -> Any: + ae_params = self.get_autoencoder_params() + disc_params = self.get_discriminator_params() + + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + opt_disc = self.instantiate_optimizer_from_config(disc_params, self.learning_rate, self.optimizer_config) + + return [opt_ae, opt_disc], [] + + @torch.no_grad() + def log_images(self, batch: Dict) -> Dict: + log = dict() + x = self.get_input(batch) + _, xrec, _ = self(x) + log["inputs"] = x + log["recs"] = xrec + with self.ema_scope(): + _, xrec_ema, _ = self(x) + log["recs_ema"] = xrec_ema + return log diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/models/autoencoder_v1_1.py b/Meissonic/vidtok_cache/VidTok/vidtok/models/autoencoder_v1_1.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1182f573eee05b634daeb9a065b7adaacc600b --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/models/autoencoder_v1_1.py @@ -0,0 +1,588 @@ +import re +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union, Optional, List +from omegaconf import ListConfig +from packaging import version + +import torch +import lightning.pytorch as pl + +from safetensors.torch import load_file as load_safetensors +from vidtok.modules.ema import LitEma +from vidtok.modules.util import (default, get_obj_from_str, + instantiate_from_config, print0) +from vidtok.modules.regularizers import pack_one, unpack_one, rearrange + + +class AbstractAutoencoder(pl.LightningModule): + """ + This is the base class for all autoencoders + """ + + def __init__( + self, + ema_decay: Union[None, float] = None, + monitor: Union[None, str] = None, + mode: Union[None, str] = None, + input_key: str = "jpg", + ): + super().__init__() + + self.input_key = input_key + self.use_ema = ema_decay is not None + self.ema_decay = ema_decay + if monitor is not None: + self.monitor = monitor + if mode is not None: + self.mode = mode + + if version.parse(torch.__version__) >= version.parse("2.0.0"): + self.automatic_optimization = False + + @abstractmethod + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + raise NotImplementedError() + + @abstractmethod + def get_input(self, batch) -> Any: + raise NotImplementedError() + + def on_train_batch_end(self, *args, **kwargs): + # for EMA computation + if self.use_ema: + self.model_ema(self) + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Switched to EMA weights" + ) + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] {context}: Restored training weights" + ) + + @abstractmethod + def encode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold 
magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] encode()-method of abstract base class called" + ) + + @abstractmethod + def decode(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] decode()-method of abstract base class called" + ) + + def instantiate_optimizer_from_config(self, params, lr, cfg): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] loading >>> {cfg['target']} <<< optimizer from config" + ) + return get_obj_from_str(cfg["target"])(params, lr=lr, **cfg.get("params", dict())) + + @abstractmethod + def configure_optimizers(self) -> Any: + raise NotImplementedError() + + +class AutoencodingEngine(AbstractAutoencoder): + """ + Base class for all video tokenizers that we train + """ + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + use_tiling: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + verbose = kwargs.pop("verbose", True) + self.use_tiling = kwargs.pop("use_tiling", False) + self.t_chunk_enc = kwargs.pop("t_chunk_enc", 16) + super().__init__(*args, **kwargs) + + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + self.optimizer_config = default(optimizer_config, {"target": "torch.optim.Adam"}) + self.lr_g_factor = lr_g_factor + + self.t_chunk_dec = self.t_chunk_enc // self.encoder.time_downsample_factor + self.use_overlap = False + self.is_causal = self.encoder.is_causal + + self.temporal_compression_ratio = 2 ** len(self.encoder.tempo_ds) + + self.use_tiling = use_tiling + # Decode more latent frames at once + self.num_sample_frames_batch_size = 16 + self.num_latent_frames_batch_size = self.num_sample_frames_batch_size // self.temporal_compression_ratio + + # We make the minimum height and width of sample for tiling half that of the generally supported + self.tile_sample_min_height = 256 + self.tile_sample_min_width = 256 + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = 0 # 1 / 8 + self.tile_overlap_factor_width = 0 # 1 / 8 + + if self.use_ema: + self.model_ema = LitEma(self, decay=self.ema_decay) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Keeping EMAs of {len(list(self.model_ema.buffers()))}." 
+ ) + + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Use ckpt_path: {ckpt_path}" + ) + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, verbose=verbose) + + def init_from_ckpt(self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple(), verbose: bool = True) -> None: + if path.endswith("ckpt"): + ckpt = torch.load(path, map_location="cpu") + weights = ckpt["state_dict"] if "state_dict" in ckpt else ckpt + elif path.endswith("safetensors"): + weights = load_safetensors(path) + else: + raise NotImplementedError(f"Unknown checkpoint: {path}") + + keys = list(weights.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Deleting key {k} from state_dict." + ) + del weights[k] + + missing, unexpected = self.load_state_dict(weights, strict=False) + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if verbose: + if len(missing) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Missing Keys: {missing}" + ) + if len(unexpected) > 0: + print0( + f"[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Unexpected Keys: {unexpected}" + ) + + def get_input(self, batch: Dict) -> torch.Tensor: + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(filter(lambda p: p.requires_grad, self.encoder.parameters())) + + list(filter(lambda p: p.requires_grad, self.decoder.parameters())) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + def _empty_causal_cached(self, parent): + for name, module in parent.named_modules(): + if hasattr(module, 'causal_cache'): + module.causal_cache = None + + def _set_first_chunk(self, is_first_chunk=True): + for module in self.modules(): + if hasattr(module, 'is_first_chunk'): + module.is_first_chunk = is_first_chunk + + def _set_cache_offset(self, modules, cache_offset=0): + for module in modules: + for submodule in module.modules(): + if hasattr(submodule, 'cache_offset'): + submodule.cache_offset = cache_offset + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def build_chunk_start_end(self, t, decoder_mode=False): + start_end = [[0, 1]] + start = 1 + end = start + while True: + if start >= t: + break + end = min(t, end + (self.t_chunk_dec if decoder_mode else self.t_chunk_enc)) + start_end.append([start, end]) + start = end + return start_end + + def enable_tiling( + self, + 
tile_sample_min_height: Optional[int] = None, + tile_sample_min_width: Optional[int] = None, + tile_overlap_factor_height: Optional[float] = None, + tile_overlap_factor_width: Optional[float] = None, + ) -> None: + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + self.use_tiling = False + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + self._empty_causal_cached(self.encoder) + self._set_first_chunk(True) + + if self.use_tiling: + z = self.tile_encode(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + else: + z = self.encoder(x) + z, reg_log = self.regularization(z, n_steps=self.global_step // 2) + + if return_reg_log: + return z, reg_log + return z + + def tile_encode(self, x: Any) -> Any: + + num_frames, height, width = x.shape[-3:] + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + rows = [] + + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = self.build_chunk_start_end(num_frames) + result_z = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + self._set_first_chunk(idx == 0) + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile = self.encoder(tile) + result_z.append(tile) + row.append(torch.cat(result_z, dim=2)) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + enc = torch.cat(result_rows, dim=3) + + return enc + + def indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + assert token_indices.dim() == 4, "token_indices should be of shape (b, t, h, w)" + b, t, h, w = token_indices.shape + token_indices = token_indices.unsqueeze(-1).reshape(b, -1, 1) + codes = self.regularization.indices_to_codes(token_indices) + codes = codes.permute(0, 2, 3, 1).reshape(b, codes.shape[2], -1) + z = self.regularization.project_out(codes) + return z.reshape(b, t, h, w, -1).permute(0, 4, 1, 2, 3) + + def tile_indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + num_frames = token_indices.shape[1] + start_end = 
self.build_chunk_start_end(num_frames, decoder_mode=True) + result_z = [] + for (start, end) in start_end: + chunk = token_indices[:, start:end, :, :] + chunk_z = self.indices_to_latent(chunk) + result_z.append(chunk_z.clone()) + return torch.cat(result_z, dim=2) + + def decode(self, z: Any, decode_from_indices: bool = False) -> torch.Tensor: + if decode_from_indices: + if self.use_tiling: + z = self.tile_indices_to_latent(z) + else: + z = self.indices_to_latent(z) + self._empty_causal_cached(self.decoder) + self._set_first_chunk(True) + + if self.use_tiling: + x = self.tile_decode(z) + else: + x = self.decoder(z) + return x + + + def tile_decode(self, z: Any) -> torch.Tensor: + + num_frames, height, width = z.shape[-3:] + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + if self.is_causal: + assert self.encoder.time_downsample_factor in [2, 4, 8], "Only support 2x, 4x or 8x temporal downsampling now." + if self.encoder.time_downsample_factor == 4: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1]], 2) + self._set_cache_offset([self.decoder.up_temporal[1].upsample, self.decoder.up_temporal[0], self.decoder.conv_out], 4) + elif self.encoder.time_downsample_factor == 2: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1], self.decoder.up_temporal[0], self.decoder.conv_out], 2) + else: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[3].upsample, self.decoder.up_temporal[2]], 2) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1]], 4) + self._set_cache_offset([self.decoder.up_temporal[1].upsample, self.decoder.up_temporal[0], self.decoder.conv_out], 8) + + start_end = self.build_chunk_start_end(num_frames, decoder_mode=True) + time = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + self._set_first_chunk(idx == 0) + tile = z[ + :, + :, + start_frame : (end_frame + 1 if self.is_causal and end_frame + 1 <= num_frames else end_frame), + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + tile = self.decoder(tile) + if self.is_causal and end_frame + 1 <= num_frames: + tile = tile[:, :, : -self.encoder.time_downsample_factor] + time.append(tile) + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) 
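# A minimal, self-contained sketch of the overlap blending that blend_v / blend_h
# above perform when stitching decoded tiles: the last `blend_extent` rows of the
# upper tile are linearly cross-faded into the first `blend_extent` rows of the
# lower tile, so tile seams are interpolated rather than hard-cut. The tensor layout
# (b, c, t, h, w) matches the code above; the helper name `blend_rows` and the toy
# sizes are illustrative assumptions only.
import torch

def blend_rows(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    # a: tile above, b: tile below; both shaped (b, c, t, h, w).
    blend_extent = min(a.shape[3], b.shape[3], blend_extent)
    for y in range(blend_extent):
        w = y / blend_extent  # weight of the lower tile, ramping 0 -> 1 across the overlap
        b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - w) + b[:, :, :, y, :] * w
    return b

if __name__ == "__main__":
    top = torch.zeros(1, 1, 1, 8, 4)    # decoded tile of constant 0.0
    bottom = torch.ones(1, 1, 1, 8, 4)  # overlapping tile below it, constant 1.0
    out = blend_rows(top, bottom, blend_extent=4)
    print(out[0, 0, 0, :, 0])           # first rows ramp 0.00, 0.25, 0.50, 0.75, then stay 1.0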
+ result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + return dec + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if self.encoder.fix_encoder: + with torch.no_grad(): + z, reg_log = self.encode(x, return_reg_log=True) + else: + z, reg_log = self.encode(x, return_reg_log=True) + dec = self.decode(z) + if dec.shape[2] != x.shape[2]: + dec = dec[:, :, -x.shape[2]:, ...] + return z, dec, reg_log + + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + opt_g, opt_d = self.optimizers() + + # autoencode loss + self.toggle_optimizer(opt_g) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + + # gradient clip + torch.nn.utils.clip_grad_norm_(self.get_autoencoder_params(), 20.0) + opt_g.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + torch.nn.utils.clip_grad_norm_(self.get_discriminator_params(), 20.0) + opt_d.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + lr = opt_g.param_groups[0]["lr"] + self.log( + "lr_abs", + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + ) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + if x.ndim == 4: + x = x.unsqueeze(2) + + z, xrec, regularization_log = self(x) + + if x.ndim == 5 and xrec.ndim == 4: + xrec = xrec.unsqueeze(2) + + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + log_dict_ae.update(log_dict_disc) + self.log_dict(log_dict_ae) + return log_dict_ae + + def configure_optimizers(self) -> Any: + ae_params = self.get_autoencoder_params() + disc_params = self.get_discriminator_params() + + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + opt_disc = self.instantiate_optimizer_from_config(disc_params, self.learning_rate, self.optimizer_config) + + return [opt_ae, opt_disc], [] + + @torch.no_grad() + def log_images(self, batch: Dict) -> Dict: + log = dict() + x = self.get_input(batch) + _, xrec, _ = self(x) + log["inputs"] = x + log["recs"] = xrec + with self.ema_scope(): + _, xrec_ema, _ = 
self(x) + log["recs_ema"] = xrec_ema + return log \ No newline at end of file diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc b/Meissonic/vidtok_cache/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33f2b721c289174afb26321194a97156b77c328b Binary files /dev/null and b/Meissonic/vidtok_cache/VidTok/vidtok/modules/__pycache__/util.cpython-310.pyc differ diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/discriminator.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/discriminator.py new file mode 100644 index 0000000000000000000000000000000000000000..f9d94b21b22f5019f3cdcc4cbf2e98bd0ce0ee02 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/discriminator.py @@ -0,0 +1,201 @@ +import functools + +import torch +import torch.nn as nn + + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + nn.init.normal_(m.weight.data, 0.0, 0.02) + elif classname.find("BatchNorm") != -1: + nn.init.normal_(m.weight.data, 1.0, 0.02) + nn.init.constant_(m.bias.data, 0) + + +class ActNorm(nn.Module): + def __init__(self, num_features, logdet=False, affine=True, allow_reverse_init=False): + assert affine + super().__init__() + self.logdet = logdet + self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1)) + self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1)) + self.allow_reverse_init = allow_reverse_init + + self.register_buffer("initialized", torch.tensor(0, dtype=torch.uint8)) + + def initialize(self, input): + with torch.no_grad(): + flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1) + mean = flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).permute(1, 0, 2, 3) + std = flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).permute(1, 0, 2, 3) + + self.loc.data.copy_(-mean) + self.scale.data.copy_(1 / (std + 1e-6)) + + def forward(self, input, reverse=False): + if reverse: + return self.reverse(input) + if len(input.shape) == 2: + input = input[:, :, None, None] + squeeze = True + else: + squeeze = False + + _, _, height, width = input.shape + + if self.training and self.initialized.item() == 0: + self.initialize(input) + self.initialized.fill_(1) + + h = self.scale * (input + self.loc) + + if squeeze: + h = h.squeeze(-1).squeeze(-1) + + if self.logdet: + log_abs = torch.log(torch.abs(self.scale)) + logdet = height * width * torch.sum(log_abs) + logdet = logdet * torch.ones(input.shape[0]).to(input) + return h, logdet + + return h + + def reverse(self, output): + if self.training and self.initialized.item() == 0: + if not self.allow_reverse_init: + raise RuntimeError( + "Initializing ActNorm in reverse direction is " + "disabled by default. Use allow_reverse_init=True to enable." 
+ ) + else: + self.initialize(output) + self.initialized.fill_(1) + + if len(output.shape) == 2: + output = output[:, :, None, None] + squeeze = True + else: + squeeze = False + + h = output / self.scale - self.loc + + if squeeze: + h = h.squeeze(-1).squeeze(-1) + return h + + +class NLayerDiscriminator(nn.Module): + """Defines a PatchGAN discriminator as in Pix2Pix.""" + # https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py + def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): + """Construct a PatchGAN discriminator + Parameters: + input_nc (int) -- the number of channels in input images + ndf (int) -- the number of filters in the last conv layer + n_layers (int) -- the number of conv layers in the discriminator + """ + super(NLayerDiscriminator, self).__init__() + if not use_actnorm: + norm_layer = nn.BatchNorm2d + else: + norm_layer = ActNorm + if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters + use_bias = norm_layer.func != nn.BatchNorm2d + else: + use_bias = norm_layer != nn.BatchNorm2d + + kw = 4 + padw = 1 + sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)] + nf_mult = 1 + nf_mult_prev = 1 + for n in range(1, n_layers): # gradually increase the number of filters + nf_mult_prev = nf_mult + nf_mult = min(2**n, 8) + sequence += [ + nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + nf_mult_prev = nf_mult + nf_mult = min(2**n_layers, 8) + sequence += [ + nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + sequence += [ + nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) + ] # output 1 channel prediction map + self.main = nn.Sequential(*sequence) + + def forward(self, input): + """Standard forward.""" + return self.main(input) + + +class NLayerDiscriminator3D(nn.Module): + """Defines a 3D PatchGAN discriminator as in Pix2Pix but for 3D inputs.""" + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/losses/discriminator.py + def __init__(self, input_nc=1, ndf=64, n_layers=3, use_actnorm=False): + """ + Construct a 3D PatchGAN discriminator + + Parameters: + input_nc (int) -- the number of channels in input volumes + ndf (int) -- the number of filters in the last conv layer + n_layers (int) -- the number of conv layers in the discriminator + use_actnorm (bool) -- flag to use actnorm instead of batchnorm + """ + super(NLayerDiscriminator3D, self).__init__() + if not use_actnorm: + norm_layer = nn.BatchNorm3d + else: + raise NotImplementedError("Not implemented.") + if type(norm_layer) == functools.partial: + use_bias = norm_layer.func != nn.BatchNorm3d + else: + use_bias = norm_layer != nn.BatchNorm3d + + kw = 3 + padw = 1 + sequence = [nn.Conv3d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)] + nf_mult = 1 + nf_mult_prev = 1 + for n in range(1, n_layers): # gradually increase the number of filters + nf_mult_prev = nf_mult + nf_mult = min(2**n, 8) + sequence += [ + nn.Conv3d( + ndf * nf_mult_prev, + ndf * nf_mult, + kernel_size=(kw, kw, kw), + stride=(2 if n == 1 else 1, 2, 2), + padding=padw, + bias=use_bias, + ), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + nf_mult_prev = nf_mult + nf_mult = 
min(2**n_layers, 8) + sequence += [ + nn.Conv3d( + ndf * nf_mult_prev, ndf * nf_mult, kernel_size=(kw, kw, kw), stride=1, padding=padw, bias=use_bias + ), + norm_layer(ndf * nf_mult), + nn.LeakyReLU(0.2, True), + ] + + sequence += [ + nn.Conv3d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) + ] # output 1 channel prediction map + self.main = nn.Sequential(*sequence) + + def forward(self, input): + """Standard forward.""" + return self.main(input) diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/distributions.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..76e814475d4d32b9f5ead736cce3a234bb5a0e5f --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/distributions.py @@ -0,0 +1,49 @@ +import numpy as np +import torch + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) + return x + + def kl(self, other=None): + if self.deterministic: + return torch.Tensor([0.0]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3], + ) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + dim=[1, 2, 3], + ) + + def nll(self, sample, dims=[1, 2, 3]): + if self.deterministic: + return torch.Tensor([0.0]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims, + ) + + def mode(self): + return self.mean diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/ema.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..9f1f7606c2c9b68ebd2302215a9e08f9f31ed8ab --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/ema.py @@ -0,0 +1,82 @@ +import torch +from torch import nn + + +class LitEma(nn.Module): + def __init__(self, model, decay=0.9999, use_num_upates=True): + super().__init__() + if decay < 0.0 or decay > 1.0: + raise ValueError("Decay must be between 0 and 1") + + self.m_name2s_name = {} + self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32)) + self.register_buffer( + "num_updates", + torch.tensor(0, dtype=torch.int) if use_num_upates else torch.tensor(-1, dtype=torch.int), + ) + + for name, p in model.named_parameters(): + if p.requires_grad: + # remove as '.'-character is not allowed in buffers + s_name = name.replace(".", "") + self.m_name2s_name.update({name: s_name}) + self.register_buffer(s_name, p.clone().detach().data) + + self.collected_params = [] + + def reset_num_updates(self): + del self.num_updates + self.register_buffer("num_updates", torch.tensor(0, dtype=torch.int)) + + def forward(self, model): + decay = self.decay + + if self.num_updates >= 0: + self.num_updates += 1 + decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) + + one_minus_decay = 1.0 - decay + + with 
torch.no_grad(): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + + for key in m_param: + if m_param[key].requires_grad: + sname = self.m_name2s_name[key] + shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) + shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) + else: + assert not key in self.m_name2s_name + + def copy_to(self, model): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + for key in m_param: + if m_param[key].requires_grad: + m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) + else: + assert not key in self.m_name2s_name + + def store(self, parameters): + """ + Save the current parameters for restoring later. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.collected_params = [param.clone() for param in parameters] + + def restore(self, parameters): + """ + Restore the parameters stored with the `store` method. + Useful to validate the model with EMA parameters without affecting the + original optimization process. Store the parameters before the + `copy_to` method. After validation (or model saving), use this to + restore the former parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored parameters. + """ + for c_param, param in zip(self.collected_params, parameters): + param.data.copy_(c_param.data) diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/logger.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..23c7d066e6508433cb2141c59dbd80cb0030ab6d --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/logger.py @@ -0,0 +1,289 @@ +import os +import numpy as np +import einops +import imageio +from typing import Union +from matplotlib import pyplot as plt +from PIL import Image, ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True # UnidentifiedImageError: https://github.com/python-pillow/Pillow/issues/5631 +from pathlib import Path + +import torch +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +from .util import exists, isheatmap + + +class ImageVideoLogger(Callback): + def __init__( + self, + batch_frequency, + max_samples, + clamp=True, + increase_log_steps=True, + batch_frequency_val=None, + video_fps=8, + rescale=True, + disabled=False, + log_on_batch_idx=True, # log on batch_idx instead of global_step. global_step is fixed in validation. 
batch_idx restarts at each validation + log_first_step=True, + log_images_kwargs=None, + log_videos_kwargs=None, + log_before_first_step=True, + enable_autocast=True, + ): + super().__init__() + self.enable_autocast = enable_autocast + self.rescale = rescale + self.batch_freq = batch_frequency + self.batch_freq_val = batch_frequency_val if batch_frequency_val is not None else batch_frequency + self.video_fps = video_fps + self.max_samples = max_samples + self.log_steps = [2**n for n in range(int(np.log2(self.batch_freq)) + 1)] + if not increase_log_steps: + self.log_steps = [self.batch_freq] + self.clamp = clamp + self.disabled = disabled + self.log_on_batch_idx = log_on_batch_idx + self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {} + self.log_videos_kwargs = log_videos_kwargs if log_videos_kwargs else {} + self.log_first_step = log_first_step + self.log_before_first_step = log_before_first_step + + @rank_zero_only + def log_img_local( + self, + save_dir, + split, + images, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "images", split) + for k in images: + if isheatmap(images[k]): + fig, ax = plt.subplots() + ax = ax.matshow( + images[k].cpu().numpy(), cmap="hot", interpolation="lanczos" + ) + plt.colorbar(ax) + plt.axis("off") + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + plt.savefig(path) + plt.close() + else: + if images[k].ndim == 5: + images[k] = einops.rearrange(images[k], "b c t h w -> (b t) c h w") + nrow = self.log_images_kwargs.get("n_rows", 8) + grid = torchvision.utils.make_grid(images[k], nrow=nrow) + if self.rescale: + grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1) + grid = grid.numpy() + grid = (grid * 255).astype(np.uint8) + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format( + k, global_step, current_epoch, batch_idx + ) + path = os.path.join(root, filename) + os.makedirs(os.path.split(path)[0], exist_ok=True) + img = Image.fromarray(grid) + img.save(path) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "logger_log_image only supports WandbLogger currently" + pl_module.logger.log_image( + key=f"{split}/{k}", + images=[ + img, + ], + step=pl_module.global_step, + ) + + @rank_zero_only + def log_vid_local( + self, + save_dir, + split, + videos, + global_step, + current_epoch, + batch_idx, + pl_module: Union[None, pl.LightningModule] = None, + ): + root = os.path.join(save_dir, "videos", split) + for k in videos: + # if is video, we can add captions + if isinstance(videos[k], torch.Tensor) and videos[k].ndim == 5: + if self.rescale: + videos[k] = (videos[k] + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + frames = [videos[k][:, :, i] for i in range(videos[k].shape[2])] + frames = [torchvision.utils.make_grid(each, nrow=4) for each in frames] + frames = [einops.rearrange(each, "c h w -> 1 c h w") for each in frames] + frames = torch.clamp(torch.cat(frames, dim=0), min=0.0, max=1.0) + frames = (frames.numpy() * 255).astype(np.uint8) + + filename = "{}_gs-{:06}_e-{:06}_b-{:06}.gif".format( + k, global_step, current_epoch, batch_idx + ) + os.makedirs(root, exist_ok=True) + path = os.path.join(root, filename) + save_numpy_as_gif(frames, path, duration=1 / self.video_fps) + if exists(pl_module): + assert isinstance( + pl_module.logger, WandbLogger + ), "log_videos 
only supports WandbLogger currently" + wandb.log({f"{split}/{k}": wandb.Video(frames, fps=self.video_fps)}) # k is str + + @rank_zero_only + def log_img(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + (self.check_frequency(check_idx) or self.check_frequency_val(batch_idx, split)) + and hasattr(pl_module, "log_images") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_images) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + with torch.no_grad(), torch.autocast(enabled=self.enable_autocast, device_type="cuda"): + images = pl_module.log_images(batch) + + for k in images: + N = min(images[k].shape[0], self.max_samples) + if not isheatmap(images[k]): + images[k] = images[k][:N] + if isinstance(images[k], torch.Tensor): + images[k] = images[k].detach().float().cpu() + if self.clamp and not isheatmap(images[k]): + images[k] = torch.clamp(images[k], -1.0, 1.0) + + self.log_img_local( + pl_module.logger.save_dir, + split, + images, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + @rank_zero_only + def log_vid(self, pl_module, batch, batch_idx, split="train"): + check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step + if ( + (self.check_frequency(check_idx) or self.check_frequency_val(batch_idx, split)) + and hasattr(pl_module, "log_videos") # batch_idx % self.batch_freq == 0 + and callable(pl_module.log_videos) + and self.max_samples > 0 + ): + logger = type(pl_module.logger) + is_train = pl_module.training + if is_train: + pl_module.eval() + + with torch.no_grad(), torch.autocast(enabled=self.enable_autocast, device_type="cuda"): + videos = pl_module.log_videos( + batch, split=split, **self.log_videos_kwargs + ) + + for k in videos: + N = min(videos[k].shape[0], self.max_samples) + videos[k] = videos[k][:N] + if isinstance(videos[k], torch.Tensor): + videos[k] = videos[k].detach().float().cpu() + if self.clamp: + videos[k] = torch.clamp(videos[k], -1.0, 1.0) + + self.log_vid_local( + pl_module.logger.save_dir, + split, + videos, + pl_module.global_step, + pl_module.current_epoch, + batch_idx, + pl_module=pl_module + if isinstance(pl_module.logger, WandbLogger) + else None, + ) + + if is_train: + pl_module.train() + + def check_frequency(self, check_idx): + if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and ( + check_idx > 0 or self.log_first_step + ): + try: + self.log_steps.pop(0) + except IndexError as e: + pass + return True + return False + + def check_frequency_val(self, check_idx, split): + if 'val' in split: + if ((check_idx % self.batch_freq_val) == 0) and ( + check_idx > 0 or self.log_first_step): + return True + return False + + @rank_zero_only + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): + if not self.disabled and (pl_module.global_step > 0 or self.log_first_step): + self.log_img(pl_module, batch, batch_idx, split="train") + self.log_vid(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): + if self.log_before_first_step and pl_module.global_step == 0: + self.log_img(pl_module, batch, batch_idx, split="train") + self.log_vid(pl_module, batch, batch_idx, split="train") + + @rank_zero_only + def 
on_validation_batch_end( + self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs + ): + if not self.disabled and pl_module.global_step > 0: + self.log_img(pl_module, batch, batch_idx, split="val") + self.log_vid(pl_module, batch, batch_idx, split="val") + if hasattr(pl_module, "calibrate_grad_norm"): + if ( + pl_module.calibrate_grad_norm and batch_idx % 25 == 0 + ) and batch_idx > 0: + self.log_gradients(trainer, pl_module, batch_idx=batch_idx) + + +def save_numpy_as_gif(frames, path, duration=None): + """ + save numpy array as gif file + """ + image_list = [] + for frame in frames: + image = frame.transpose(1, 2, 0) + image_list.append(image) + if duration: + imageio.mimsave(path, image_list, format="GIF", duration=duration, loop=0) + else: + imageio.mimsave(path, image_list, format="GIF", loop=0) diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/losses.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..c96f471c72124c56025b40a17f7d7bda81446ca5 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/losses.py @@ -0,0 +1,262 @@ +from typing import Any, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .discriminator import (NLayerDiscriminator, NLayerDiscriminator3D, + weights_init) +from .lpips import LPIPS +from .util import default, print0 + + +def hinge_d_loss(logits_real, logits_fake): + loss_real = torch.mean(F.relu(1.0 - logits_real)) + loss_fake = torch.mean(F.relu(1.0 + logits_fake)) + d_loss = 0.5 * (loss_real + loss_fake) + return d_loss + + +def vanilla_d_loss(logits_real, logits_fake): + d_loss = 0.5 * (torch.mean(F.softplus(-logits_real)) + torch.mean(F.softplus(logits_fake))) + return d_loss + + +def adopt_weight(weight, global_step, threshold=0, value=0.0): + if global_step < threshold: + weight = value + return weight + + +def _sigmoid_cross_entropy_with_logits(labels, logits): + """ + non-saturating loss + """ + zeros = torch.zeros_like(logits, dtype=logits.dtype) + condition = logits >= zeros + relu_logits = torch.where(condition, logits, zeros) + neg_abs_logits = torch.where(condition, -logits, logits) + return relu_logits - logits * labels + torch.log1p(torch.exp(neg_abs_logits)) + + +def non_saturate_gen_loss(logits_fake): + """ + logits_fake: [B 1 H W] + """ + B = logits_fake.shape[0] + logits_fake = logits_fake.reshape(B, -1) + logits_fake = torch.mean(logits_fake, dim=-1) + gen_loss = torch.mean(_sigmoid_cross_entropy_with_logits(labels=torch.ones_like(logits_fake), logits=logits_fake)) + return gen_loss + + +def lecam_reg(real_pred, fake_pred, lecam_ema): + reg = torch.mean(F.relu(real_pred - lecam_ema.logits_fake_ema).pow(2)) + torch.mean( + F.relu(lecam_ema.logits_real_ema - fake_pred).pow(2) + ) + return reg + + +class LeCAM_EMA(object): + # https://github.com/TencentARC/SEED-Voken/blob/main/src/Open_MAGVIT2/modules/losses/vqperceptual.py + def __init__(self, init=0.0, decay=0.999): + self.logits_real_ema = init + self.logits_fake_ema = init + self.decay = decay + + def update(self, logits_real, logits_fake): + self.logits_real_ema = self.logits_real_ema * self.decay + torch.mean(logits_real).item() * (1 - self.decay) + self.logits_fake_ema = self.logits_fake_ema * self.decay + torch.mean(logits_fake).item() * (1 - self.decay) + + +class GeneralLPIPSWithDiscriminator(nn.Module): + def __init__( + self, + disc_start: int, + logvar_init: float = 0.0, + pixelloss_weight=1.0, + 
disc_num_layers: int = 3, + disc_in_channels: int = 3, + disc_factor: float = 1.0, + disc_weight: float = 1.0, + disc_type: str = "3d", + perceptual_weight: float = 1.0, + lecam_loss_weight: float = 0.0, + disc_loss: str = "hinge", + scale_input_to_tgt_size: bool = False, + dims: int = 2, + learn_logvar: bool = False, + regularization_weights: Union[None, dict] = None, + gen_loss_cross_entropy: bool = False, + ): + super().__init__() + self.dims = dims + if self.dims > 2: + print0( + f"[bold cyan]\[vidtok.modules.losses][GeneralLPIPSWithDiscriminator][/bold cyan] running with dims={dims}. This means that for perceptual loss calculation, " + f"the LPIPS loss will be applied to each frame independently. " + ) + self.scale_input_to_tgt_size = scale_input_to_tgt_size + assert disc_loss in ["hinge", "vanilla"] + self.pixel_weight = pixelloss_weight + self.perceptual_loss = LPIPS().eval() + self.perceptual_weight = perceptual_weight + # output log variance + self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) + self.learn_logvar = learn_logvar + self.disc_type = disc_type + assert self.disc_type in ["2d", "3d"] + + if self.disc_type == "2d": + self.discriminator = NLayerDiscriminator( + input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=False + ).apply(weights_init) + else: + self.discriminator = NLayerDiscriminator3D( + input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=False + ).apply(weights_init) + self.discriminator_iter_start = disc_start + self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss + self.disc_factor = disc_factor + self.discriminator_weight = disc_weight + self.regularization_weights = default(regularization_weights, {}) + self.gen_loss_cross_entropy = gen_loss_cross_entropy + self.lecam_loss_weight = lecam_loss_weight + if self.lecam_loss_weight > 0: + self.lecam_ema = LeCAM_EMA() + + def get_trainable_parameters(self) -> Any: + return self.discriminator.parameters() + + def get_trainable_autoencoder_parameters(self) -> Any: + if self.learn_logvar: + yield self.logvar + yield from () + + def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): + if last_layer is not None: + nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] + g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] + else: + nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] + g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] + + d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) + d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() + d_weight = d_weight * self.discriminator_weight + return d_weight + + def forward( + self, + regularization_log, + inputs, + reconstructions, + optimizer_idx, + global_step, + last_layer=None, + split="train", + weights=None, + ): + if self.scale_input_to_tgt_size: + inputs = torch.nn.functional.interpolate(inputs, reconstructions.shape[2:], mode="bicubic", antialias=True) + + if optimizer_idx == 0: + bs = inputs.shape[0] + t = inputs.shape[2] + if self.dims > 2: + inputs, reconstructions = map( + lambda x: rearrange(x, "b c t h w -> (b t) c h w"), + (inputs, reconstructions), + ) + + rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) + if self.perceptual_weight > 0: + p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) + rec_loss = rec_loss + self.perceptual_weight * p_loss + else: + p_loss = torch.Tensor([0.0]) + + nll_loss = rec_loss / 
torch.exp(self.logvar) + self.logvar + weighted_nll_loss = nll_loss + if weights is not None: + weighted_nll_loss = weights * nll_loss + weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] + nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] + + # now the GAN part + if self.disc_type == "3d": + reconstructions = rearrange(reconstructions, "(b t) c h w -> b c t h w", t=t).contiguous() + + # generator update + logits_fake = self.discriminator(reconstructions) + + if not self.gen_loss_cross_entropy: + g_loss = -torch.mean(logits_fake) + else: + g_loss = non_saturate_gen_loss(logits_fake) + + if self.disc_factor > 0.0: + try: + d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) + except RuntimeError: + assert not self.training + d_weight = torch.tensor(0.0) + else: + d_weight = torch.tensor(0.0) + + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) + loss = weighted_nll_loss + d_weight * disc_factor * g_loss + log = dict() + for k in regularization_log: + if k in self.regularization_weights: + loss = loss + self.regularization_weights[k] * regularization_log[k] + log[f"{split}/{k}"] = regularization_log[k].detach().mean() + + log.update( + { + "{}/total_loss".format(split): loss.clone().detach().mean(), + "{}/logvar".format(split): self.logvar.detach(), + "{}/nll_loss".format(split): nll_loss.detach().mean(), + "{}/rec_loss".format(split): rec_loss.detach().mean(), + "{}/p_loss".format(split): p_loss.detach().mean(), + "{}/d_weight".format(split): d_weight.detach(), + "{}/disc_factor".format(split): torch.tensor(disc_factor), + "{}/g_loss".format(split): g_loss.detach().mean(), + } + ) + return loss, log + + if optimizer_idx == 1: + if self.disc_type == "2d" and self.dims > 2: + inputs, reconstructions = map( + lambda x: rearrange(x, "b c t h w -> (b t) c h w"), + (inputs, reconstructions), + ) + + logits_real = self.discriminator(inputs.contiguous().detach()) + logits_fake = self.discriminator(reconstructions.contiguous().detach()) + + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) + + non_saturate_d_loss = self.disc_loss(logits_real, logits_fake) + + if self.lecam_loss_weight > 0: + self.lecam_ema.update(logits_real, logits_fake) + lecam_loss = lecam_reg(logits_real, logits_fake, self.lecam_ema) + d_loss = disc_factor * (lecam_loss * self.lecam_loss_weight + non_saturate_d_loss) + else: + d_loss = disc_factor * non_saturate_d_loss + + log = { + "{}/disc_loss".format(split): d_loss.clone().detach().mean(), + "{}/logits_real".format(split): logits_real.detach().mean(), + "{}/logits_fake".format(split): logits_fake.detach().mean(), + "{}/disc_factor".format(split): torch.tensor(disc_factor), + "{}/non_saturated_d_loss".format(split): non_saturate_d_loss.detach(), + } + + if self.lecam_loss_weight > 0: + log.update({"{}/lecam_loss".format(split): lecam_loss.detach()}) + + return d_loss, log diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/lpips.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/lpips.py new file mode 100644 index 0000000000000000000000000000000000000000..22bb5fa315618c0d1f0463d67ea57c29083fe302 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/lpips.py @@ -0,0 +1,172 @@ +import hashlib +import os +from collections import namedtuple +from tqdm import tqdm + +import requests +import torch +import torch.nn as nn +from torchvision import models + +from .util import print0 + +URL_MAP = {"vgg_lpips": 
"https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"} + +CKPT_MAP = {"vgg_lpips": "vgg.pth"} + +MD5_MAP = {"vgg_lpips": "d507d7349b931f0638a25a48a722f98a"} + + +def download(url, local_path, chunk_size=1024): + os.makedirs(os.path.split(local_path)[0], exist_ok=True) + with requests.get(url, stream=True) as r: + total_size = int(r.headers.get("content-length", 0)) + with tqdm(total=total_size, unit="B", unit_scale=True) as pbar: + with open(local_path, "wb") as f: + for data in r.iter_content(chunk_size=chunk_size): + if data: + f.write(data) + pbar.update(chunk_size) + + +def md5_hash(path): + with open(path, "rb") as f: + content = f.read() + return hashlib.md5(content).hexdigest() + + +def get_ckpt_path(name, root, check=False): + assert name in URL_MAP + path = os.path.join(root, CKPT_MAP[name]) + if os.path.exists(path) and not (check and not md5_hash(path) == MD5_MAP[name]): + print0( + "[bold cyan]\[vidtok.modules.lpips]\[get_ckpt_path][/bold cyan] Using existing path for {} model: {}".format( + name, path + ) + ) + return path + + # if not, download the model + print0( + "[bold cyan]\[vidtok.modules.lpips]\[get_ckpt_path][/bold cyan] Downloading {} model from {} to {}".format( + name, URL_MAP[name], path + ) + ) + download(URL_MAP[name], path) + md5 = md5_hash(path) + assert md5 == MD5_MAP[name], md5 + return path + + +class LPIPS(nn.Module): + # Learned perceptual metric + def __init__(self, use_dropout=True): + super().__init__() + self.scaling_layer = ScalingLayer() + self.chns = [64, 128, 256, 512, 512] # vg16 features + self.net = vgg16(pretrained=True, requires_grad=False) + self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout) + self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout) + self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout) + self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout) + self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout) + self.load_from_pretrained() + for param in self.parameters(): + param.requires_grad = False + + def load_from_pretrained(self, name="vgg_lpips"): + ckpt = get_ckpt_path(name, "checkpoints/lpips") + self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False) + print0("[bold cyan]\[vidtok.modules.lpips][LPIPS][/bold cyan] loaded pretrained LPIPS loss from {}".format(ckpt)) + + def forward(self, input, target): + in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target)) + outs0, outs1 = self.net(in0_input), self.net(in1_input) + feats0, feats1, diffs = {}, {}, {} + lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4] + for kk in range(len(self.chns)): + feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk]) + diffs[kk] = (feats0[kk] - feats1[kk]) ** 2 + + res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))] + val = res[0] + for l in range(1, len(self.chns)): + val += res[l] + return val + + +class ScalingLayer(nn.Module): + def __init__(self): + super(ScalingLayer, self).__init__() + self.register_buffer("shift", torch.Tensor([-0.030, -0.088, -0.188])[None, :, None, None]) + self.register_buffer("scale", torch.Tensor([0.458, 0.448, 0.450])[None, :, None, None]) + + def forward(self, inp): + return (inp - self.shift) / self.scale + + +class NetLinLayer(nn.Module): + """A single linear layer which does a 1x1 conv""" + + def __init__(self, chn_in, chn_out=1, use_dropout=False): + super(NetLinLayer, self).__init__() + layers = ( + [ + nn.Dropout(), + ] + 
if (use_dropout) + else [] + ) + layers += [ + nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), + ] + self.model = nn.Sequential(*layers) + + +class vgg16(torch.nn.Module): + def __init__(self, requires_grad=False, pretrained=True): + super(vgg16, self).__init__() + vgg_pretrained_features = models.vgg16(pretrained=pretrained).features + self.slice1 = torch.nn.Sequential() + self.slice2 = torch.nn.Sequential() + self.slice3 = torch.nn.Sequential() + self.slice4 = torch.nn.Sequential() + self.slice5 = torch.nn.Sequential() + self.N_slices = 5 + for x in range(4): + self.slice1.add_module(str(x), vgg_pretrained_features[x]) + for x in range(4, 9): + self.slice2.add_module(str(x), vgg_pretrained_features[x]) + for x in range(9, 16): + self.slice3.add_module(str(x), vgg_pretrained_features[x]) + for x in range(16, 23): + self.slice4.add_module(str(x), vgg_pretrained_features[x]) + for x in range(23, 30): + self.slice5.add_module(str(x), vgg_pretrained_features[x]) + if not requires_grad: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, X): + h = self.slice1(X) + h_relu1_2 = h + h = self.slice2(h) + h_relu2_2 = h + h = self.slice3(h) + h_relu3_3 = h + h = self.slice4(h) + h_relu4_3 = h + h = self.slice5(h) + h_relu5_3 = h + vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]) + out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3) + return out + + +def normalize_tensor(x, eps=1e-10): + norm_factor = torch.sqrt(torch.sum(x**2, dim=1, keepdim=True)) + return x / (norm_factor + eps) + + +def spatial_average(x, keepdim=True): + return x.mean([2, 3], keepdim=keepdim) diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dcausal.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dcausal.py new file mode 100644 index 0000000000000000000000000000000000000000..c71a8b09e370451046eef9ba60315feab2459e35 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dcausal.py @@ -0,0 +1,885 @@ +from typing import Callable +from beartype import beartype +from beartype.typing import Tuple, Union + +import einops +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .util import checkpoint + + +def spatial_temporal_resblk(x, block_s, block_t, temb): + assert len(x.shape) == 5, "input should be 5D tensor, but got {}D tensor".format(len(x.shape)) + B, C, T, H, W = x.shape + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + x = block_s(x, temb) + x = einops.rearrange(x, "(b t) c h w -> b c t h w", b=B, t=T) + x = einops.rearrange(x, "b c t h w -> (b h w) c t") + x = block_t(x, temb) + x = einops.rearrange(x, "(b h w) c t -> b c t h w", b=B, h=H, w=W) + return x + + +def nonlinearity(x): + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32, norm_type="groupnorm"): + if norm_type == "groupnorm": + return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + elif norm_type == "layernorm": + return LayerNorm(num_channels=in_channels, eps=1e-6) + + +def pad_at_dim(t, pad, dim=-1, pad_mode="constant", value=0.0): + assert pad_mode in ["constant", "replicate", "reflect"] + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right + if pad_mode == "constant": + return F.pad(t, (*zeros, *pad), value=value) + return F.pad(t, (*zeros, *pad), mode=pad_mode) + + +def divisible_by(num, den): + return (num % den) == 0 + + +def 
is_odd(n): + return not divisible_by(n, 2) + + +def cast_tuple(t, length=1): + return t if isinstance(t, tuple) else ((t,) * length) + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class LayerNorm(nn.Module): + def __init__(self, num_channels, eps=1e-6, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = torch.nn.LayerNorm(num_channels, eps=eps, elementwise_affine=True) + + def forward(self, x): + if x.dim() == 5: + x = rearrange(x, "b c t h w -> b t h w c") + x = self.norm(x) + x = rearrange(x, "b t h w c -> b c t h w") + elif x.dim() == 4: + x = rearrange(x, "b c h w -> b h w c") + x = self.norm(x) + x = rearrange(x, "b h w c -> b c h w") + else: + x = rearrange(x, "b c s -> b s c") + x = self.norm(x) + x = rearrange(x, "b s c -> b c s") + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__() + self.in_channels = in_channels + self.norm_type = norm_type + + self.norm = Normalize(in_channels, norm_type=self.norm_type) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c h w -> b 1 (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x, **kwargs): + if self.use_checkpoint: + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x) + + def _forward(self, x, **kwargs): + h_ = x + h_ = self.attention(h_) + h_ = self.proj_out(h_) + return x + h_ + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + B = h_.shape[0] + h_ = rearrange(h_, "b c t h w -> (b t) c h w") + h_ = self.norm(h_) + h_ = rearrange(h_, "(b t) c h w -> b c t h w", b=B) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class CausalConv1d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: int, pad_mode="constant", **kwargs): + super().__init__() + dilation = kwargs.pop("dilation", 1) + stride = 
kwargs.pop("stride", 1) + self.pad_mode = pad_mode + self.time_pad = dilation * (kernel_size - 1) + (1 - stride) + self.time_causal_padding = (self.time_pad, 0) + + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + pad_mode = self.pad_mode if self.time_pad < x.shape[2] else "constant" + x = F.pad(x, self.time_causal_padding, mode=pad_mode) + return self.conv(x) + + +class CausalConv3d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: Union[int, Tuple[int, int, int]], pad_mode="constant", **kwargs): + super().__init__() + kernel_size = cast_tuple(kernel_size, 3) + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + dilation = cast_tuple(dilation, 3) + stride = cast_tuple(stride, 3) + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + assert is_odd(height_kernel_size) and is_odd(width_kernel_size) + + self.pad_mode = pad_mode + time_pad = dilation[0] * (time_kernel_size - 1) + (1 - stride[0]) + height_pad = dilation[1] * (height_kernel_size - 1) + (1 - stride[1]) + width_pad = dilation[2] * (height_kernel_size - 1) + (1 - stride[2]) + + self.time_pad = time_pad + self.time_causal_padding = ( + width_pad // 2, + width_pad - width_pad // 2, + height_pad // 2, + height_pad - height_pad // 2, + time_pad, + 0, + ) + + self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + pad_mode = self.pad_mode if self.time_pad < x.shape[2] else "constant" + + x = F.pad(x, self.time_causal_padding, mode=pad_mode) + return self.conv(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = CausalConv3d(in_channels, out_channels, 3, stride=(2, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 1, 0) + x1 = self.avg_pool(torch.nn.functional.pad(x, pad, mode="constant", value=0)) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.conv = 
CausalConv3d(in_channels, out_channels, 3) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode="nearest").to( + x.dtype + ) + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetCausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv3d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + else: + self.nin_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + ) + 
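# A minimal sketch of the causal temporal padding idea used by CausalConv1d /
# CausalConv3d above: the time axis is padded only on the "past" side by
# (kernel_size - 1) positions, so the output at frame t never depends on frames
# after t. The class name TinyCausalConv1d and the toy sizes are illustrative
# assumptions, not identifiers from the module above.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyCausalConv1d(nn.Module):
    def __init__(self, chan_in: int, chan_out: int, kernel_size: int):
        super().__init__()
        self.time_pad = kernel_size - 1          # all padding goes to the left (the past)
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, time); left-pad the time axis, then convolve.
        x = F.pad(x, (self.time_pad, 0))
        return self.conv(x)

if __name__ == "__main__":
    conv = TinyCausalConv1d(1, 1, kernel_size=3)
    x = torch.randn(1, 1, 8)
    y = conv(x)
    assert y.shape == x.shape                    # causal padding preserves sequence length
    x_future = x.clone()
    x_future[..., -1] += 1.0                     # perturb only the last (future) frame
    assert torch.allclose(conv(x_future)[..., :-1], y[..., :-1])  # earlier outputs unchanged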
self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + return x + h + + +class ResnetCausalBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv1d(out_channels, out_channels, kernel_size=3, stride=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + else: + self.nin_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=1, stride=1) + + if zero_init: + self.conv2.conv.weight.data.zero_() + self.conv2.conv.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class EncoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_ds=None, + tempo_ds=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = 
len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.norm_type = norm_type + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.is_causal = True + + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.spatial_ds = list(range(0, self.num_resolutions - 1)) if spatial_ds is None else spatial_ds + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] if tempo_ds is None else tempo_ds + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetCausalBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + down_temporal.attn = attn_temporal + + if i_level in self.spatial_ds: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + down_temporal.downsample = TimeDownsampleResCausal2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls(block_in, norm_type=self.norm_type) + + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + ) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def forward(self, x): + temb = None + B, _, T, H, W = x.shape + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + + if i_level in self.spatial_ds: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal downsample + B, _, T, H, W = htmp.shape + if i_level in self.tempo_ds: + htmp = self.down_temporal[i_level].downsample(htmp) + + hs.append(htmp) + B, 
_, T, H, W = htmp.shape + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = nonlinearity(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv_out(h) + + return h + + +class EncoderCausal3DPadding(EncoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.init_pad_mode = ignore_kwargs.get("init_pad_mode", "replicate") + self.time_padding = self.time_downsample_factor - 1 + if self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + video_len = x.shape[2] + if video_len % self.time_downsample_factor != 0: + x = pad_at_dim(x, (self.time_padding, 0), dim=2, pad_mode=self.init_pad_mode, value=0.0) + return super().forward(x) + + +class DecoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_us=None, + tempo_us=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.norm_type = norm_type + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.spatial_us = list(range(1, self.num_resolutions)) if spatial_us is None else spatial_us + self.tempo_us = [1, 2] if tempo_us is None else tempo_us + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.spatial_us: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * 
ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetCausalBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleResCausal2x(block_in, block_in) + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def get_last_layer(self, **kwargs): + try: + return self.conv_out.conv.weight + except: + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, H, W = z.shape + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + + if i_level in self.spatial_us: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal upsample + B, _, T, H, W = h.shape + if i_level in self.tempo_us: + h = self.up_temporal[i_level].upsample(h) + B, _, T, H, W = h.shape + + # end + if self.give_pre_end: + return h + + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + + if self.tanh_out: + h = torch.tanh(h) + + return h + + +class DecoderCausal3DPadding(DecoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.time_padding = self.time_downsample_factor - 1 + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + x = super().forward(x) + return x[:, :, self.time_padding :, :, :] diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dcausal_v1_1.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dcausal_v1_1.py new file mode 100644 index 0000000000000000000000000000000000000000..44397a135334a58cf5774fb899152f701d56a37d --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dcausal_v1_1.py @@ -0,0 +1,959 @@ +from typing import Callable +from beartype import beartype +from beartype.typing import Tuple, Union + +import einops +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .util import checkpoint + + +def spatial_temporal_resblk(x, block_s, block_t, temb): + assert len(x.shape) == 5, "input should be 5D tensor, but got {}D tensor".format(len(x.shape)) + B, C, T, H, W = x.shape + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + x = block_s(x, temb) + x = einops.rearrange(x, "(b t) c h w -> b c t h w", b=B, t=T) + x = einops.rearrange(x, "b 
c t h w -> (b h w) c t") + x = block_t(x, temb) + x = einops.rearrange(x, "(b h w) c t -> b c t h w", b=B, h=H, w=W) + return x + + +def nonlinearity(x): + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32, norm_type="groupnorm"): + if norm_type == "groupnorm": + return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + elif norm_type == "layernorm": + return LayerNorm(num_channels=in_channels, eps=1e-6) + + +def pad_at_dim(t, pad, dim=-1, pad_mode="constant", value=0.0): + assert pad_mode in ["constant", "replicate", "reflect"] + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right + if pad_mode == "constant": + return F.pad(t, (*zeros, *pad), value=value) + return F.pad(t, (*zeros, *pad), mode=pad_mode) + + +def divisible_by(num, den): + return (num % den) == 0 + + +def is_odd(n): + return not divisible_by(n, 2) + + +def cast_tuple(t, length=1): + return t if isinstance(t, tuple) else ((t,) * length) + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class LayerNorm(nn.Module): + def __init__(self, num_channels, eps=1e-6, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = torch.nn.LayerNorm(num_channels, eps=eps, elementwise_affine=True) + + def forward(self, x): + if x.dim() == 5: + x = rearrange(x, "b c t h w -> b t h w c") + x = self.norm(x) + x = rearrange(x, "b t h w c -> b c t h w") + elif x.dim() == 4: + x = rearrange(x, "b c h w -> b h w c") + x = self.norm(x) + x = rearrange(x, "b h w c -> b c h w") + else: + x = rearrange(x, "b c s -> b s c") + x = self.norm(x) + x = rearrange(x, "b s c -> b c s") + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__() + self.in_channels = in_channels + self.norm_type = norm_type + + self.norm = Normalize(in_channels, norm_type=self.norm_type) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c h w -> b 1 (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x, **kwargs): + if self.use_checkpoint: + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x) + + def _forward(self, x, **kwargs): + h_ = x + h_ = self.attention(h_) + h_ = self.proj_out(h_) + return x + h_ + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.v = 
CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1) + + def attention(self, h_: torch.Tensor) -> torch.Tensor: + B = h_.shape[0] + h_ = rearrange(h_, "b c t h w -> (b t) c h w") + h_ = self.norm(h_) + h_ = rearrange(h_, "(b t) c h w -> b c t h w", b=B) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class CausalConv1d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: int, pad_mode="constant", **kwargs): + super().__init__() + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + self.pad_mode = pad_mode + self.time_pad = dilation * (kernel_size - 1) + (1 - stride) + + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + self.is_first_chunk = True + self.causal_cache = None + self.cache_offset = 0 + + def forward(self, x): + if self.is_first_chunk: + first_frame_pad = x[:, :, :1].repeat( + (1, 1, self.time_pad) + ) + else: + first_frame_pad = self.causal_cache + if self.time_pad != 0: + first_frame_pad = first_frame_pad[:, :, -self.time_pad:] + else: + first_frame_pad = first_frame_pad[:, :, 0:0] + + x = torch.concatenate((first_frame_pad, x), dim=2) + + if self.cache_offset == 0: + self.causal_cache = x.clone() + else: + self.causal_cache = x[:,:,:-self.cache_offset].clone() + + return self.conv(x) + + +class CausalConv3d(nn.Module): + @beartype + def __init__(self, chan_in, chan_out, kernel_size: Union[int, Tuple[int, int, int]], pad_mode="constant", **kwargs): + super().__init__() + kernel_size = cast_tuple(kernel_size, 3) + dilation = kwargs.pop("dilation", 1) + stride = kwargs.pop("stride", 1) + dilation = cast_tuple(dilation, 3) + stride = cast_tuple(stride, 3) + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + assert is_odd(height_kernel_size) and is_odd(width_kernel_size) + + self.pad_mode = pad_mode + time_pad = dilation[0] * (time_kernel_size - 1) + (1 - stride[0]) + height_pad = dilation[1] * (height_kernel_size - 1) + (1 - stride[1]) + width_pad = dilation[2] * (width_kernel_size - 1) + (1 - stride[2]) + + self.time_pad = time_pad + self.spatial_padding = ( + width_pad // 2, + width_pad - width_pad // 2, + height_pad // 2, + height_pad - height_pad // 2, + 0, + 0, + ) + + self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + self.is_first_chunk = True + self.causal_cache = None + self.cache_offset = 0 + + def forward(self, x): + if self.is_first_chunk: + first_frame_pad = x[:, :, :1, :, :].repeat( + (1, 1, self.time_pad, 1, 1) + ) + else: + first_frame_pad = self.causal_cache + if self.time_pad != 0: + first_frame_pad = first_frame_pad[:, :, -self.time_pad:] + else: + first_frame_pad = first_frame_pad[:, :, 0:0] + + x = torch.concatenate((first_frame_pad, x), dim=2) + + if self.cache_offset == 0: + self.causal_cache = x.clone() + else: + self.causal_cache = x[:,:,:-self.cache_offset].clone() + + x = F.pad(x, self.spatial_padding, mode=self.pad_mode) + return self.conv(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = 
with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = CausalConv3d(in_channels, out_channels, 3, stride=(2, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + self.is_first_chunk = True + self.causal_cache = None + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 1, 0) + + if self.is_first_chunk: + x_pad = torch.nn.functional.pad(x, pad, mode="replicate") + else: + x_pad = torch.concatenate((self.causal_cache, x), dim=2) + + self.causal_cache = x_pad[:,:,-1:].clone() + + x1 = self.avg_pool(x_pad) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleResCausal2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + interpolation_mode='nearest', + num_temp_upsample=1 + ): + super().__init__() + self.conv = CausalConv3d(in_channels, out_channels, 3) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + self.interpolation_mode = interpolation_mode + self.num_temp_upsample = num_temp_upsample + self.enable_cached = (self.interpolation_mode == 'trilinear') + self.is_first_chunk = True + self.causal_cache = None + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + if not self.enable_cached: + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + elif not self.is_first_chunk: + x = torch.cat([self.causal_cache, x], dim=2) + self.causal_cache = x[:, :, -2*self.num_temp_upsample:-self.num_temp_upsample].clone() + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + x = x[:, :, 2*self.num_temp_upsample:] + else: + self.causal_cache = x[:, :, -self.num_temp_upsample:].clone() + x, _x = x[:, :, :self.num_temp_upsample], x[:, :, self.num_temp_upsample:] + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(x.dtype) + if _x.shape[-3] > 0: + _x = F.interpolate(_x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to(_x.dtype) + x = torch.concat([x, _x], dim=2) + + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + 
out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetCausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv3d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + ) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + ) + else: + self.nin_shortcut = CausalConv3d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + ) + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + 
self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + return x + h + + +class ResnetCausalBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = CausalConv1d(out_channels, out_channels, kernel_size=3, stride=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=3, stride=1) + else: + self.nin_shortcut = CausalConv1d(in_channels, out_channels, kernel_size=1, stride=1) + + if zero_init: + self.conv2.conv.weight.data.zero_() + self.conv2.conv.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + B = x.shape[0] + h = x + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm1(h) + h = nonlinearity(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = rearrange(h, "(b s) c t -> (b t) c s", b=B) + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = rearrange(h, "(b t) c s -> (b s) c t", b=B) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class EncoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_ds=None, + tempo_ds=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.norm_type = norm_type + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.is_causal = True + + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.spatial_ds = list(range(0, self.num_resolutions - 1)) if spatial_ds is 
None else spatial_ds + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] if tempo_ds is None else tempo_ds + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetCausalBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + down_temporal.attn = attn_temporal + + if i_level in self.spatial_ds: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + down_temporal.downsample = TimeDownsampleResCausal2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls(block_in, norm_type=self.norm_type) + + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + ) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def forward(self, x): + temb = None + B, _, T, H, W = x.shape + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + + if i_level in self.spatial_ds: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal downsample + B, _, T, H, W = htmp.shape + if i_level in self.tempo_ds: + htmp = self.down_temporal[i_level].downsample(htmp) + + hs.append(htmp) + B, _, T, H, W = htmp.shape + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = nonlinearity(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = self.conv_out(h) + + return h + + +class EncoderCausal3DPadding(EncoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + self.time_downsample_factor = 
ignore_kwargs.get("time_downsample_factor", 4) + self.init_pad_mode = ignore_kwargs.get("init_pad_mode", "replicate") + + if self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + video_len = x.shape[2] + if video_len % self.time_downsample_factor != 0: + time_padding = self.time_downsample_factor - video_len % self.time_downsample_factor + x = pad_at_dim(x, (time_padding, 0), dim=2, pad_mode=self.init_pad_mode, value=0.0) + return super().forward(x) + + +class DecoderCausal3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + spatial_us=None, + tempo_us=None, + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.norm_type = norm_type + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + self.interpolation_mode = ignorekwargs.get("interpolation_mode", 'nearest') + assert self.interpolation_mode in ['nearest', 'trilinear'] + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.spatial_us = list(range(1, self.num_resolutions)) if spatial_us is None else spatial_us + self.tempo_us = [1, 2] if tempo_us is None else tempo_us + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.spatial_us: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + num_temp_upsample = 1 + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetCausalBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = 
block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleResCausal2x(block_in, block_in, interpolation_mode=self.interpolation_mode, num_temp_upsample=num_temp_upsample) + num_temp_upsample *= 2 + + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1) + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetCausalBlock + + def _make_conv(self) -> Callable: + return CausalConv3d + + def get_last_layer(self, **kwargs): + try: + return self.conv_out.conv.weight + except: + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, H, W = z.shape + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + + if i_level in self.spatial_us: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + + # temporal upsample + B, _, T, H, W = h.shape + if i_level in self.tempo_us: + h = self.up_temporal[i_level].upsample(h) + B, _, T, H, W = h.shape + + # end + if self.give_pre_end: + return h + + B, C, T, H, W = h.shape + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.norm_out(h) + h = rearrange(h, "(b t) c h w -> b c t h w", b=B) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + + if self.tanh_out: + h = torch.tanh(h) + + return h + + +class DecoderCausal3DPadding(DecoderCausal3D): + def __init__(self, *args, **ignore_kwargs): + super().__init__(*args, **ignore_kwargs) + + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def forward(self, x): + x = super().forward(x) + return x diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dnoncausal.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dnoncausal.py new file mode 100644 index 0000000000000000000000000000000000000000..4223fb635eb88cd7e7292943c131965f2b814206 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/model_3dnoncausal.py @@ -0,0 +1,652 @@ +from typing import Callable + +import einops +import torch +import torch.nn as nn +from einops import rearrange + +from .model_3dcausal import (AttnBlock, Normalize, nonlinearity, + spatial_temporal_resblk) +from .util import checkpoint + + +def make_attn(in_channels, use_checkpoint=False, norm_type="groupnorm"): + return AttnBlockWrapper(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + + +class AttnBlockWrapper(AttnBlock): + def __init__(self, in_channels, use_checkpoint=False, norm_type="groupnorm"): + super().__init__(in_channels, use_checkpoint=use_checkpoint, norm_type=norm_type) + self.q = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def attention(self, h_: torch.Tensor) -> torch.Tensor: 
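+ # Descriptive comment (added): per-frame spatial self-attention — q/k/v come from 1x1x1 Conv3d projections, and scaled_dot_product_attention runs over the (h*w) positions of each frame; the time axis is treated as a batch/head dimension, so frames do not attend to each other.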
+ h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, t, h, w = q.shape + q, k, v = map(lambda x: rearrange(x, "b c t h w -> b t (h w) c").contiguous(), (q, k, v)) + h_ = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return rearrange(h_, "b t (h w) c -> b c t h w", h=h, w=w, c=c, b=b) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.in_channels = in_channels + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class TimeDownsampleRes2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + self.conv = nn.Conv3d(in_channels, out_channels, 3, stride=(2, 1, 1), padding=(0, 1, 1)) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + pad = (0, 0, 0, 0, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x1 = self.avg_pool(x) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class TimeUpsampleRes2x(nn.Module): + def __init__( + self, + in_channels, + out_channels, + mix_factor: float = 2.0, + ): + super().__init__() + self.conv = nn.Conv3d(in_channels, out_channels, 3, padding=1) + # https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/opensora/models/causalvideovae/model/modules/updownsample.py + self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor])) + + def forward(self, x): + alpha = torch.sigmoid(self.mix_factor) + xlst = [ + torch.nn.functional.interpolate( + sx.unsqueeze(0).to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode="nearest" + ).to(x.dtype) + for sx in x + ] + x = torch.cat(xlst, dim=0) + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, 
norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetBlock1D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + zero_init=False, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + if zero_init: + self.conv2.weight.data.zero_() + self.conv2.bias.data.zero_() + + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class ResnetNoncausalBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + use_checkpoint=False, + norm_type="groupnorm", + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = 
conv_shortcut + self.norm_type = norm_type + + self.norm1 = Normalize(in_channels, norm_type=self.norm_type) + self.conv1 = nn.Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, norm_type=self.norm_type) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = nn.Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=1) + self.use_checkpoint = use_checkpoint + + def forward(self, x, temb): + if self.use_checkpoint: + assert temb is None, "checkpointing not supported with temb" + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + else: + return self._forward(x, temb) + + def _forward(self, x, temb=None): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class Encoder3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch=8, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels, + z_channels, + double_z=True, + norm_type="groupnorm", + **ignore_kwargs, + ): + super().__init__() + use_checkpoint = ignore_kwargs.get("use_checkpoint", False) + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.fix_encoder = ignore_kwargs.get("fix_encoder", False) + self.time_downsample_factor = ignore_kwargs.get("time_downsample_factor", 4) + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] + self.spatial_ds = list(range(0, self.num_resolutions - 1)) # add for spatial tiling + self.norm_type = norm_type + self.is_causal = False + + # downsampling + make_conv_cls = self._make_conv() + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_temporal.append( + ResnetBlock1D( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + 
down_temporal.attn = attn_temporal + + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + if i_level in self.tempo_ds: + down_temporal.downsample = TimeDownsampleRes2x(block_in, block_in) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn(block_in, norm_type=self.norm_type) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1, + ) + + if self.fix_encoder: + for param in self.parameters(): + param.requires_grad = False + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetNoncausalBlock + + def _make_conv(self) -> Callable: + return nn.Conv3d + + def forward(self, x): + temb = None + B, _, T, _, _ = x.shape + + # downsampling + if x.shape[1] == 4 and self.conv_in.in_channels == 3: + raise ValueError("Mismatched number of input channels") + hs = [self.conv_in(x)] + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = spatial_temporal_resblk( + hs[-1], self.down[i_level].block[i_block], self.down_temporal[i_level].block[i_block], temb + ) + hs.append(h) + if i_level != self.num_resolutions - 1: + # spatial downsample + htmp = einops.rearrange(hs[-1], "b c t h w -> (b t) c h w") + htmp = self.down[i_level].downsample(htmp) + htmp = einops.rearrange(htmp, "(b t) c h w -> b c t h w", b=B, t=T) + if i_level in self.tempo_ds: + # temporal downsample + htmp = self.down_temporal[i_level].downsample(htmp) + hs.append(htmp) + B, _, T, _, _ = htmp.shape + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder3D(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + dropout=0.0, + resamp_with_conv=True, + in_channels=8, + z_channels, + give_pre_end=False, + tanh_out=False, + norm_type="groupnorm", + **ignorekwargs, + ): + super().__init__() + use_checkpoint = ignorekwargs.get("use_checkpoint", False) + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + self.fix_decoder = ignorekwargs.get("fix_decoder", False) + self.tempo_us = [1, 2] + self.norm_type = norm_type + + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_attn_cls = self._make_attn() + make_resblock_cls = self._make_resblock() + make_conv_cls = self._make_conv() + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + 
use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + self.mid.attn_1 = make_attn_cls( + block_in, use_checkpoint=use_checkpoint, norm_type=self.norm_type + ) + self.mid.block_2 = make_resblock_cls( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + self.up.insert(0, up) + + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock1D( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + zero_init=True, + use_checkpoint=use_checkpoint, + norm_type=self.norm_type, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = TimeUpsampleRes2x(block_in, block_in) + + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = Normalize(block_in, norm_type=self.norm_type) + self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1, padding=1) + + if self.fix_decoder: + for param in self.parameters(): + param.requires_grad = False + + def _make_attn(self) -> Callable: + return make_attn + + def _make_resblock(self) -> Callable: + return ResnetNoncausalBlock + + def _make_conv(self) -> Callable: + return nn.Conv3d + + def get_last_layer(self, **kwargs): + return self.conv_out.weight + + def forward(self, z, **kwargs): + temb = None + B, _, T, _, _ = z.shape + + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb, **kwargs) + h = self.mid.attn_1(h, **kwargs) + h = self.mid.block_2(h, temb, **kwargs) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = spatial_temporal_resblk( + h, self.up[i_level].block[i_block], self.up_temporal[i_level].block[i_block], temb + ) + if i_level != 0: + # spatial upsample + h = einops.rearrange(h, "b c t h w -> (b t) c h w") + h = self.up[i_level].upsample(h) + h = einops.rearrange(h, "(b t) c h w -> b c t h w", b=B, t=T) + if i_level in self.tempo_us: + # temporal upsample + h = self.up_temporal[i_level].upsample(h) + B, _, T, _, _ = h.shape + # end + if self.give_pre_end: + return h + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h, **kwargs) + if self.tanh_out: + h = torch.tanh(h) + return h diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/regularizers.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/regularizers.py new file mode 100644 index 0000000000000000000000000000000000000000..4f4f1a4fccdb873f640e637ecbaaaef389e75b41 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/regularizers.py @@ -0,0 +1,268 @@ +from abc import abstractmethod +from 
functools import cache +from typing import Any, List, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from einops import pack, rearrange, reduce, unpack +from torch import Tensor, int32 +from torch.cuda.amp import autocast + +from .distributions import DiagonalGaussianDistribution + + +def exists(v): + return v is not None + + +def default(*args): + for arg in args: + if exists(arg): + return arg + return None + + +def pack_one(t, pattern): + return pack([t], pattern) + + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + + +def round_ste(z: Tensor) -> Tensor: + """Round with straight through gradients.""" + zhat = z.round() + return z + (zhat - z).detach() + + +def log(t, eps=1e-5): + return t.clamp(min=eps).log() + + +def entropy(prob): + return (-prob * log(prob)).sum(dim=-1) + + +def maybe_distributed_mean(t): + if not is_distributed(): + return t + dist.all_reduce(t) + t = t / dist.get_world_size() + return t + + +@cache +def is_distributed(): + return dist.is_initialized() and dist.get_world_size() > 1 + + +class AbstractRegularizer(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]: + raise NotImplementedError() + + @abstractmethod + def get_trainable_parameters(self) -> Any: + raise NotImplementedError() + + +class DiagonalGaussianRegularizer(AbstractRegularizer): + def __init__(self, sample: bool = True): + super().__init__() + self.sample = sample + + def get_trainable_parameters(self) -> Any: + yield from () + + def forward(self, z: torch.Tensor, n_steps=None) -> Tuple[torch.Tensor, dict]: + log = dict() + posterior = DiagonalGaussianDistribution(z) + if self.sample: + z = posterior.sample() + else: + z = posterior.mode() + kl_loss = posterior.kl() + kl_loss = torch.sum(kl_loss) / kl_loss.shape[0] + log["kl_loss"] = kl_loss + return z, log + + +class FSQRegularizer(AbstractRegularizer): + # https://github.com/lucidrains/vector-quantize-pytorch/blob/master/vector_quantize_pytorch/finite_scalar_quantization.py + def __init__( + self, + levels: List[int], + dim: Optional[int] = None, + num_codebooks=1, + keep_num_codebooks_dim: Optional[bool] = None, + scale: Optional[float] = None, + entropy_loss_weight: float = 0.0, + entropy_loss_annealing_steps: int = 0, + entropy_loss_annealing_factor: float = 1.0, + commitment_loss_weight: float = 0.0, + diversity_gamma: float = 1.0, + ): + super().__init__() + _levels = torch.tensor(levels, dtype=int32) + self.register_buffer("_levels", _levels, persistent=False) + + _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32) + self.register_buffer("_basis", _basis, persistent=False) + + self.scale = scale + self.entropy_loss_weight = entropy_loss_weight + self.entropy_loss_annealing_steps = entropy_loss_annealing_steps + self.entropy_loss_annealing_factor = entropy_loss_annealing_factor + self.commitment_loss_weight = commitment_loss_weight + self.diversity_gamma = diversity_gamma + + codebook_dim = len(levels) + self.codebook_dim = codebook_dim + + effective_codebook_dim = codebook_dim * num_codebooks + self.num_codebooks = num_codebooks + self.effective_codebook_dim = effective_codebook_dim + + keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1) + assert not (num_codebooks > 1 and not keep_num_codebooks_dim) + self.keep_num_codebooks_dim = keep_num_codebooks_dim + + self.dim = default(dim, len(_levels) * num_codebooks) + + 
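# Descriptive comment (added): the linear project_in/project_out layers are used only when the requested dim differs from the effective codebook dim (num_codebooks * len(levels)); otherwise they are Identity. +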
has_projections = self.dim != effective_codebook_dim + self.project_in = nn.Linear(self.dim, effective_codebook_dim) if has_projections else nn.Identity() + self.project_out = nn.Linear(effective_codebook_dim, self.dim) if has_projections else nn.Identity() + self.has_projections = has_projections + + self.codebook_size = self._levels.prod().item() + + implicit_codebook = self.indices_to_codes(torch.arange(self.codebook_size), project_out=False) + self.register_buffer("implicit_codebook", implicit_codebook, persistent=False) + self.register_buffer("zero", torch.tensor(0.0), persistent=False) + + self.global_codebook_usage = torch.zeros([2**self.codebook_dim, self.num_codebooks], dtype=torch.long) + + def get_trainable_parameters(self) -> Any: + return self.parameters() + + def bound(self, z: Tensor, eps: float = 1e-3) -> Tensor: + """Bound `z`, an array of shape (..., d).""" + half_l = (self._levels - 1) * (1 + eps) / 2 + offset = torch.where(self._levels % 2 == 0, 0.5, 0.0) + shift = (offset / half_l).atanh() + return (z + shift).tanh() * half_l - offset + + def quantize(self, z: Tensor) -> Tensor: + """Quantizes z, returns quantized zhat, same shape as z.""" + quantized = round_ste(self.bound(z)) + half_width = self._levels // 2 + return quantized / half_width + + def _scale_and_shift(self, zhat_normalized: Tensor) -> Tensor: + half_width = self._levels // 2 + return (zhat_normalized * half_width) + half_width + + def _scale_and_shift_inverse(self, zhat: Tensor) -> Tensor: + half_width = self._levels // 2 + return (zhat - half_width) / half_width + + def codes_to_indices(self, zhat: Tensor) -> Tensor: + """Converts a `code` to an index in the codebook.""" + assert zhat.shape[-1] == self.codebook_dim + zhat = self._scale_and_shift(zhat) + return (zhat * self._basis).sum(dim=-1).to(int32) + + def indices_to_codes(self, indices: Tensor, project_out=True) -> Tensor: + """Inverse of `codes_to_indices`.""" + + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + + indices = rearrange(indices, "... -> ... 1") + codes_non_centered = (indices // self._basis) % self._levels + codes = self._scale_and_shift_inverse(codes_non_centered) + + if self.keep_num_codebooks_dim: + codes = rearrange(codes, "... c d -> ... (c d)") + + if project_out: + codes = self.project_out(codes) + + if is_img_or_video: + codes = rearrange(codes, "b ... d -> b d ...") + + return codes + + def calculate_entropy_loss_weight(self, n_steps): + if n_steps >= self.entropy_loss_annealing_steps: + return self.entropy_loss_weight + start = self.entropy_loss_annealing_factor * self.entropy_loss_weight + return start - (n_steps / self.entropy_loss_annealing_steps) * (start - self.entropy_loss_weight) + + @autocast(enabled=False) + def forward(self, z: Tensor, inv_temperature: float = 100.0, n_steps: int = 0) -> Tensor: + """ + einstein notation + b - batch + n - sequence (or flattened spatial dimensions) + d - feature dimension + c - number of codebook dim + """ + is_img_or_video = z.ndim >= 4 + if is_img_or_video: + z = rearrange(z, "b d ... -> b ... 
d") + z, ps = pack_one(z, "b * d") + + assert z.shape[-1] == self.dim, f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}" + + z = self.project_in(z) + z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks) + + with torch.autocast("cuda", enabled=False): + orig_dtype = z.dtype + z = z.float() + original_input = z + codes = self.quantize(z) + indices = self.codes_to_indices(codes) + + if self.entropy_loss_weight > 0 or self.commitment_loss_weight > 0: + # the same as euclidean distance up to a constant + distance = -2 * torch.einsum("... i d, j d -> ... i j", original_input, self.implicit_codebook) + prob = (-distance * inv_temperature).softmax(dim=-1) + per_sample_probs = rearrange(prob, "b n ... -> (b n) ...") + per_sample_entropy = entropy(per_sample_probs).mean() + # distribution over all available tokens in the batch + avg_prob = reduce(per_sample_probs, "... c d -> c d", "mean") + avg_prob = maybe_distributed_mean(avg_prob) + codebook_entropy = entropy(avg_prob).mean() + entropy_aux_loss = per_sample_entropy - self.diversity_gamma * codebook_entropy + # commit loss + commit_loss = F.mse_loss(original_input, codes.detach(), reduction="none") + commit_loss = commit_loss.mean() + else: + entropy_aux_loss = per_sample_entropy = codebook_entropy = commit_loss = self.zero + + codes = codes.type(orig_dtype) + + codes = rearrange(codes, "b n c d -> b n (c d)") + out = self.project_out(codes) + + # reconstitute image or video dimensions + if is_img_or_video: + out = unpack_one(out, ps, "b * d") + out = rearrange(out, "b ... d -> b d ...") + + indices = unpack_one(indices, ps, "b * c") + + if not self.keep_num_codebooks_dim: + indices = rearrange(indices, "... 1 -> ...") + + aux_loss = ( + entropy_aux_loss * self.calculate_entropy_loss_weight(n_steps) + commit_loss * self.commitment_loss_weight + ) + + return out, dict(indices=indices, aux_loss=aux_loss) diff --git a/Meissonic/vidtok_cache/VidTok/vidtok/modules/util.py b/Meissonic/vidtok_cache/VidTok/vidtok/modules/util.py new file mode 100644 index 0000000000000000000000000000000000000000..9570016221e47b50d536d30352b5718a29bb0009 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtok/modules/util.py @@ -0,0 +1,324 @@ +import importlib +import random +import os +import einops +import numpy as np +from inspect import isfunction +from rich import print +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from lightning.pytorch.utilities.rank_zero import rank_zero_only + + +def get_valid_dirs(dir1: str, dir2: str, dir3: Union[None, str] = None) -> Union[None, str]: + if (dir1 is not None) and os.path.isdir(dir1): + return dir1 + elif (dir2 is not None) and os.path.isdir(dir2): + return dir2 + elif (dir3 is not None) and os.path.isdir(dir3): + return dir3 + else: + return None + + +def get_valid_paths(path1: str, path2: str, path3: Union[None, str] = None) -> Union[None, str]: + if (path1 is not None) and os.path.isfile(path1): + return path1 + elif (path2 is not None) and os.path.isfile(path2): + return path2 + elif (path3 is not None) and os.path.isfile(path3): + return path3 + else: + return None + + +@rank_zero_only +def print0(*args, **kwargs): + print(*args, **kwargs) + + +def seed_anything(seed: int): + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def isheatmap(x): + if not isinstance(x, torch.Tensor): + return 
False + + return x.ndim == 2 + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def instantiate_from_config(config): + if not "target" in config: + if config == "__is_first_stage__": + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + return get_obj_from_str(config["target"])(**config.get("params", dict())) + + +def get_obj_from_str(string, reload=False, invalidate_cache=True): + module, cls = string.rsplit(".", 1) + if invalidate_cache: + importlib.invalidate_caches() + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def checkpoint(func, inputs, params, flag): + # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(torch.autograd.Function): + # https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + ctx.gpu_autocast_kwargs = { + "enabled": torch.is_autocast_enabled(), + "dtype": torch.get_autocast_gpu_dtype(), + "cache_enabled": torch.is_autocast_cache_enabled(), + } + with torch.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] + # Ensure all tensors have requires_grad set to True + ctx.input_params = [p.requires_grad_(True) for p in ctx.input_params] + with torch.enable_grad(), torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. 
+ shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = torch.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads + + +def compute_psnr(x, y): + if x.dim() == 5: + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + assert y.dim() == 5 + y = einops.rearrange(y, "b c t h w -> (b t) c h w") + EPS = 1e-8 + mse = torch.mean((x - y) ** 2, dim=[1, 2, 3]) + psnr = -10 * torch.log10(mse + EPS) + return psnr.mean(dim=0) + + +def compute_ssim(x, y): + if x.dim() == 5: + x = einops.rearrange(x, "b c t h w -> (b t) c h w") + assert y.dim() == 5 + y = einops.rearrange(y, "b c t h w -> (b t) c h w") + kernel_size = 11 + kernel_sigma = 1.5 + k1 = 0.01 + k2 = 0.03 + + f = max(1, round(min(x.size()[-2:]) / 256)) + if f > 1: + x = F.avg_pool2d(x, kernel_size=f) + y = F.avg_pool2d(y, kernel_size=f) + + kernel = gaussian_filter(kernel_size, kernel_sigma, device=x.device, dtype=x.dtype).repeat(x.size(1), 1, 1, 1) + + _compute_ssim_per_channel = _ssim_per_channel_complex if x.dim() == 5 else _ssim_per_channel + ssim_map, cs_map = _compute_ssim_per_channel(x=x, y=y, kernel=kernel, data_range=1, k1=k1, k2=k2) + ssim_val = ssim_map.mean(1) + + return ssim_val.mean(dim=0) + + +def _ssim_per_channel( + x: torch.Tensor, + y: torch.Tensor, + kernel: torch.Tensor, + data_range: Union[float, int] = 1.0, + k1: float = 0.01, + k2: float = 0.03, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. + + Args: + x: An input tensor. Shape :math:`(N, C, H, W)`. + y: A target tensor. Shape :math:`(N, C, H, W)`. + kernel: 2D Gaussian kernel. + data_range: Maximum value range of images (usually 1.0 or 255). + k1: Algorithm parameter, K1 (small constant, see [1]). + k2: Algorithm parameter, K2 (small constant, see [1]). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Full Value of Structural Similarity (SSIM) index. + """ + if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): + raise ValueError( + f"Kernel size can't be greater than actual input size. " + f"Input size: {x.size()}. Kernel size: {kernel.size()}" + ) + + c1 = k1**2 + c2 = k2**2 + n_channels = x.size(1) + mu_x = F.conv2d(x, weight=kernel, stride=1, padding=0, groups=n_channels) + mu_y = F.conv2d(y, weight=kernel, stride=1, padding=0, groups=n_channels) + + mu_xx = mu_x**2 + mu_yy = mu_y**2 + mu_xy = mu_x * mu_y + + sigma_xx = F.conv2d(x**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xx + sigma_yy = F.conv2d(y**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_yy + sigma_xy = F.conv2d(x * y, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xy + + # Contrast sensitivity (CS) with alpha = beta = gamma = 1. + cs = (2.0 * sigma_xy + c2) / (sigma_xx + sigma_yy + c2) + + # Structural similarity (SSIM) + ss = (2.0 * mu_xy + c1) / (mu_xx + mu_yy + c1) * cs + + ssim_val = ss.mean(dim=(-1, -2)) + cs = cs.mean(dim=(-1, -2)) + return ssim_val, cs + + +def _ssim_per_channel_complex( + x: torch.Tensor, + y: torch.Tensor, + kernel: torch.Tensor, + data_range: Union[float, int] = 1.0, + k1: float = 0.01, + k2: float = 0.03, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. 
+ + Args: + x: An input tensor. Shape :math:`(N, C, H, W, 2)`. + y: A target tensor. Shape :math:`(N, C, H, W, 2)`. + kernel: 2-D gauss kernel. + data_range: Maximum value range of images (usually 1.0 or 255). + k1: Algorithm parameter, K1 (small constant, see [1]). + k2: Algorithm parameter, K2 (small constant, see [1]). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Full Value of Complex Structural Similarity (SSIM) index. + """ + n_channels = x.size(1) + if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): + raise ValueError( + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " + f"Kernel size: {kernel.size()}" + ) + + c1 = k1**2 + c2 = k2**2 + + x_real = x[..., 0] + x_imag = x[..., 1] + y_real = y[..., 0] + y_imag = y[..., 1] + + mu1_real = F.conv2d(x_real, weight=kernel, stride=1, padding=0, groups=n_channels) + mu1_imag = F.conv2d(x_imag, weight=kernel, stride=1, padding=0, groups=n_channels) + mu2_real = F.conv2d(y_real, weight=kernel, stride=1, padding=0, groups=n_channels) + mu2_imag = F.conv2d(y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) + + mu1_sq = mu1_real.pow(2) + mu1_imag.pow(2) + mu2_sq = mu2_real.pow(2) + mu2_imag.pow(2) + mu1_mu2_real = mu1_real * mu2_real - mu1_imag * mu2_imag + mu1_mu2_imag = mu1_real * mu2_imag + mu1_imag * mu2_real + + compensation = 1.0 + + x_sq = x_real.pow(2) + x_imag.pow(2) + y_sq = y_real.pow(2) + y_imag.pow(2) + x_y_real = x_real * y_real - x_imag * y_imag + x_y_imag = x_real * y_imag + x_imag * y_real + + sigma1_sq = F.conv2d(x_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_sq + sigma2_sq = F.conv2d(y_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu2_sq + sigma12_real = F.conv2d(x_y_real, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_real + sigma12_imag = F.conv2d(x_y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_imag + sigma12 = torch.stack((sigma12_imag, sigma12_real), dim=-1) + mu1_mu2 = torch.stack((mu1_mu2_real, mu1_mu2_imag), dim=-1) + # Set alpha = beta = gamma = 1. + cs_map = (sigma12 * 2 + c2 * compensation) / (sigma1_sq.unsqueeze(-1) + sigma2_sq.unsqueeze(-1) + c2 * compensation) + ssim_map = (mu1_mu2 * 2 + c1 * compensation) / (mu1_sq.unsqueeze(-1) + mu2_sq.unsqueeze(-1) + c1 * compensation) + ssim_map = ssim_map * cs_map + + ssim_val = ssim_map.mean(dim=(-2, -3)) + cs = cs_map.mean(dim=(-2, -3)) + + return ssim_val, cs + + +def gaussian_filter( + kernel_size: int, sigma: float, device: Optional[str] = None, dtype: Optional[type] = None +) -> torch.Tensor: + r"""Returns 2D Gaussian kernel N(0,`sigma`^2) + Args: + kernel_size: Size of the kernel + sigma: Std of the distribution + device: target device for kernel generation + dtype: target data type for kernel generation + Returns: + gaussian_kernel: Tensor with shape (1, kernel_size, kernel_size) + """ + coords = torch.arange(kernel_size, dtype=dtype, device=device) + coords -= (kernel_size - 1) / 2.0 + + g = coords**2 + g = (-(g.unsqueeze(0) + g.unsqueeze(1)) / (2 * sigma**2)).exp() + + g /= g.sum() + return g.unsqueeze(0) diff --git a/Meissonic/vidtok_cache/VidTok/vidtwin/README.md b/Meissonic/vidtok_cache/VidTok/vidtwin/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e475c6eade763f9cef3b67a905dd738e1fb43fef --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtwin/README.md @@ -0,0 +1,211 @@ + + +

+VidTwin VidTwin: Video VAE with Decoupled Structure and Dynamics (CVPR 2025) +

+

+ Yuchi Wang   + Junliang Guo   + Xinyi Xie   + Tianyu He   + Xu Sun   + Jiang Bian +

+ +
+ +
+ +[![arXiv](https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2412.17726)   [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow)](https://huggingface.co/microsoft/vidtwin)   [![Static Badge](https://img.shields.io/badge/Demo-Project_Page-yellow)](https://vidtwin.github.io/) + +

🔥 Check our Demo Page for enhanced visual experience.

+ + +
+
+
+ +We propose a novel and compact video autoencoder, VidTwin, that decouples video into two distinct latent spaces: **Structure latent vectors**, which capture overall content and global movement, and **Dynamics latent vectors**, which represent fine-grained details and rapid movements. + +Extensive experiments show that VidTwin achieves a high compression rate of 0.20% with high reconstruction quality (PSNR of 28.14 on the MCL-JCV dataset), and performs efficiently and effectively in downstream generative tasks. Moreover, our model demonstrates explainability and scalability, paving the way for future research in video latent representation and generation. +
+
+ +
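+A minimal conceptual sketch of this decoupled interface (illustrative only; it follows the `encode`/`decode` signatures in `vidtwin/models/vidtwin_ae.py`, where the two latents are named `z_content` and `z_motion`, and it reuses the placeholder config/checkpoint paths from the Inference section below):
+
+```python
+import torch
+from scripts.inference_evaluate import load_model_from_config
+
+# placeholder paths; substitute your own config and checkpoint
+model = load_model_from_config(
+    "configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml",
+    "checkpoints/vidtwin_structure_7_7_8_dynamics_7_8.ckpt",
+).eval()
+
+x = torch.rand(1, 3, 16, 224, 224) * 2 - 1  # [B, C, T, H, W], range -1~1
+z, z_structure, z_dynamics, _ = model.encode(x, return_reg_log=True)
+# z_structure (z_content in the code): temporally compressed latent for overall content and global movement
+# z_dynamics (z_motion in the code): per-frame, spatially compressed latent for fine details and rapid motion
+x_recon = model.decode(z, z_structure, z_dynamics)  # same shape as x
+# swapping z_dynamics with the latent of another clip gives the cross-reenactment setup described below
+```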
+ +## Setup + +1. Our code is based on **VidTok**, so you will need to install the [required packages for VidTok](https://github.com/microsoft/VidTok?tab=readme-ov-file#setup) first. To do so, navigate to the VidTok folder and create the environment using the `environment.yaml` file: + +```bash +cd VidTok +# Prepare conda environment +conda env create -f environment.yaml +# Activate the environment +conda activate vidtok +``` + +2. After setting up VidTok, install the additional packages required for the VidTwin model: +```bash +pip install transformers +pip install timm +pip install flash-attn --no-build-isolation +``` + + +## Training + +### Data Preparation + +We follow the same approach as **VidTok** to prepare the data. You can also find the Dataloader class in `vidtok/data/vidtok.py`. This Dataloader is a general-purpose class for handling video data. You may customize it to suit your own dataset and specific use cases. + +1. Put all training videos under `DATA_DIR`: +``` +└── DATA_DIR + ├── subset1 + │ ├── videoname11.mp4 + │ └── videoname12.mp4 + ├── subset2 + │ ├── videoname21.mp4 + │ ├── videoname22.mp4 + │ └── subsubset1 + │ ├── videoname211.mp4 + │ └── videoname212.mp4 + └── ... +``` +2. Prepare a `.csv` meta file to record the relative paths of these videos with respect to `DATA_DIR`, like: +``` +videos +subset1/videoname11.mp4 +subset2/videoname21.mp4 +subset2/subsubset1/videoname211.mp4 +``` + +> Validation data is also prepared following the above steps. + +### Launch Training + +1. Specify the Configuration File + +Our code follows a **modular design**, allowing you to easily customize the model structure and training settings by modifying a configuration file. For the **VidTwin** model, we provide the following configuration file: `configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml`. + +- In the **Model** section of the configuration file, you can specify the model's structure and key hyperparameters. For instance, you can adjust the following settings: + +```yaml +model: + params: + expect_ch: 8 # the dimension of the Structure Latent, d_S + cont_num_blocks: 1 # downsample blocks of the Structure Latent, 1 -> h_S = 7, 2 -> h_S = 4, 3 -> h_S = 2 + downsample_motion: True + motion_num_blocks: 1 # downsample blocks of the Dynamics Latent, 1 -> h_D = 7, 2 -> h_D = 4, 3 -> h_D = 2 + d_dim: 8 # the dimension of the Dynamics Latent, d_D +``` + +- If you'd like to **fine-tune** the model from a pre-trained checkpoint instead of training from scratch, you can specify the `ckpt_path` parameter in the configuration file. + +```yaml +model: + params: + ckpt_path: PATH_TO_CHECKPOINT # train from existing checkpoint +``` + +- In the **Data** section of the configuration file, you can specify paths and other important data-related hyperparameters. 
+ +```yaml +train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 # 224 for our VidTwin model + input_width: INPUT_WIDTH_1 # 224 for our VidTwin model + sample_num_frames: NUM_FRAMES_1 # set to 16 for our VidTwin model + sample_fps: SAMPLE_FPS_1 # sample fps for training data, 8 for VidTwin model +validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 # 224 for our VidTwin model + input_width: INPUT_WIDTH_2 # 224 for our VidTwin model + sample_num_frames: NUM_FRAMES_2 # set to 16 for our VidTwin model + sample_fps: SAMPLE_FPS_2 # sample fps for validation data + start_index: 0 # fixed value to ensure the same sampled data +``` + +2. Run the following command to start training: +```bash +python main.py -b CONFIG --logdir LOGDIR + +# You can also use `torchrun` to start the training code. +``` +Training logs and checkpoints are saved in `LOGDIR`. + +It is recommended to use [Weights & Biases](https://wandb.ai/site) as the data visualization tool ([TensorBoard](https://www.tensorflow.org/tensorboard) by default). Use `wandb login` to log in first, and then run: +``` +python main.py -b CONFIG --logdir LOGDIR --wandb --wandb_entity ENTITY --wandb_project PROJECT +``` + +## Inference + + +### Easy Usage +We provide the following example for quick usage of our models. After downloading the checkpoint from our [Huggingface page](https://huggingface.co/microsoft/vidtwin), just provide the path of the configuration file `cfg_path` and the checkpoint file `ckpt_path` to the script. +```python +import torch +from scripts.inference_evaluate import load_model_from_config + +cfg_path = "configs/vidtwin/vidtwin_structure_7_7_8_dynamics_7_8.yaml" +ckpt_path = "checkpoints/vidtwin_structure_7_7_8_dynamics_7_8.ckpt" + +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +# load pre-trained model +model = load_model_from_config(cfg_path, ckpt_path) +model.to(device).eval() +# random input +num_frames = 16 +x_input = (torch.rand(1, 3, num_frames, 224, 224) * 2 - 1).to(device) # [B, C, T, H, W], range -1~1 +# model forward +_, x_recon, *_ = model(x_input) +assert x_input.shape == x_recon.shape +``` + +### Reconstruct an Input Video +```bash +python vidtwin/scripts/inference_reconstruct.py --config CONFIG --ckpt CKPT --input_video_path VIDEO_PATH --num_frames_per_batch NUM_FRAMES_PER_BATCH --input_height 224 --input_width 224 --sample_fps 25 --output_video_dir OUTPUT_DIR +``` +- Set `VIDEO_PATH` to the path of your test video. We provide an example video in `assets/example.mp4`. +- Set `NUM_FRAMES_PER_BATCH` to `16`. +- The reconstructed video is saved in `OUTPUT_DIR`. + +### Performance Evaluation +We also provide a script `scripts/inference_evaluate.py` to evaluate the video reconstruction performance in PSNR, SSIM and LPIPS. + +1. Put all of your test videos under `DATA_DIR`. +2. 
Run the following command, and all `.mp4` videos under `DATA_DIR` will be tested: +```bash +python vidtwin/scripts/inference_evaluate.py --config CONFIG --ckpt CKPT --data_dir DATA_DIR --num_frames_per_batch NUM_FRAMES_PER_BATCH --input_height 224 --input_width 224 --sample_fps 25 +``` +(Optional) If you only want to test certain videos under `DATA_DIR`, you need to prepare a `.csv` meta file +to indicate the video files to be tested (refer to [Data Preparation](#data-preparation)). And add `--meta_path META_PATH` to the above command to specify the path to the `.csv` meta file. + + + +### Cross-reenactment Reconstruction + +For VidTwin model, we conduct a cross-reenactment experiment in which we combine the *Structure Latent* from one video, $A$, with the *Dynamics Latent* from another video, $B$, to observe the generated output from the decoder, i.e., generating $\mathcal{D}(u^A_{\boldsymbol{S}}, u^B_{\boldsymbol{D}})$. + +To facilitate this experiment, we provide the script `vidtwin/scripts/inference_vidtwin_cross_reconstruct.py`. This script follows a similar usage method to `vidtwin/scripts/inference_reconstruct.py` with the addition of two new arguments: `--input_video_path_structure` and `--input_video_path_dynamics`, which allow you to specify the videos for structure and dynamics information, respectively. + +## BibTeX +If you find our project helpful to your research, please consider starring this repository🌟 and citing our paper. +```bibtex +@article{wang2024vidtwin, + title={VidTwin: Video VAE with Decoupled Structure and Dynamics}, + author={Wang, Yuchi and Guo, Junliang and Xie, Xinyi and He, Tianyu and Sun, Xu and Bian, Jiang}, + year={2024}, + journal={arXiv preprint arXiv:2412.17726}, +} +``` diff --git a/Meissonic/vidtok_cache/VidTok/vidtwin/models/vidtwin_ae.py b/Meissonic/vidtok_cache/VidTok/vidtwin/models/vidtwin_ae.py new file mode 100644 index 0000000000000000000000000000000000000000..22daf6cae27b2731021490adf8a4b8cb41dc5a4b --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtwin/models/vidtwin_ae.py @@ -0,0 +1,1604 @@ +import os +import re +import math +from abc import abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Tuple, Union + +import lightning.pytorch as pl +import torch +import einops +from omegaconf import ListConfig +from packaging import version +from safetensors.torch import load_file as load_safetensors + +from torch.optim.lr_scheduler import _LRScheduler, LambdaLR, StepLR +from vidtok.modules.util import default, instantiate_from_config, print0, get_valid_paths +from vidtok.modules.util import compute_psnr, compute_ssim +from vidtok.models.autoencoder import AbstractAutoencoder +import numpy as np +from torch import nn +from einops import rearrange, repeat +import transformers + + +class VidAutoEncoderQformerBase(AbstractAutoencoder): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def init_from_ckpt( + self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple() + ) -> None: + if path.endswith("ckpt"): + # sd = torch.load(path, map_location="cpu")["state_dict"] + ckpt = torch.load(path, map_location="cpu") + if "state_dict" in ckpt: + sd = ckpt["state_dict"] + else: + sd = ckpt + elif path.endswith("safetensors"): + sd = load_safetensors(path) + else: + raise NotImplementedError + + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if re.match(ik, k): + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Deleting key {k} 
from state_dict.") + del sd[k] + + for k, tensor in sd.items(): + sd[k] = tensor.to(torch.float64) + + missing, unexpected = self.load_state_dict(sd, strict=False) + print0( + f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if len(missing) > 0: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Unexpected Keys: {unexpected}") + + def get_input(self, batch: Dict) -> torch.Tensor: + # assuming unified data format, dataloader returns a dict. + # image tensors should be scaled to -1 ... 1 and in channels-first format (e.g., bchw instead if bhwc) + return batch[self.input_key] + + def get_autoencoder_params(self) -> list: + params = ( + list(self.encoder.parameters()) + + list(self.decoder.parameters()) + + list(self.get_disentangle_params()) + + list(self.regularization.get_trainable_parameters()) + + list(self.loss.get_trainable_autoencoder_parameters()) + ) + return params + + def get_discriminator_params(self) -> list: + params = list(self.loss.get_trainable_parameters()) # e.g., discriminator + return params + + def get_last_layer(self): + return self.decoder.get_last_layer() + + # See https://github.com/Lightning-AI/pytorch-lightning/issues/17801 and https://lightning.ai/docs/pytorch/stable/common/optimization.html for the reason of this change + def training_step(self, batch, batch_idx) -> Any: + x = self.get_input(batch) + z, xrec, regularization_log, *_ = self(x) + opt_g, opt_d = self.optimizers() + sch1, sch2 = self.lr_schedulers() + + + # autoencode loss + self.toggle_optimizer(opt_g) + # adversarial loss is binary cross-entropy + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_g.zero_grad() + self.manual_backward(aeloss) + opt_g.step() + sch1.step() + self.untoggle_optimizer(opt_g) + + # discriminator loss + self.toggle_optimizer(opt_d) + # adversarial loss is binary cross-entropy + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="train", + ) + opt_d.zero_grad() + self.manual_backward(discloss) + opt_d.step() + + sch2.step() + self.untoggle_optimizer(opt_d) + + # logging + log_dict = { + "train/aeloss": aeloss, + "train/discloss": discloss, + } + log_dict.update(log_dict_ae) + log_dict.update(log_dict_disc) + self.log_dict(log_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + + def validation_step(self, batch, batch_idx) -> Dict: + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + log_dict.update(log_dict_ema) + return log_dict + + def _validation_step(self, batch, batch_idx, postfix="") -> Dict: + x = self.get_input(batch) + + z, xrec, regularization_log, *_ = self(x) + aeloss, log_dict_ae = self.loss( + regularization_log, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + + discloss, log_dict_disc = self.loss( + regularization_log, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split="val" + postfix, + ) + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + 
log_dict_ae.update(log_dict_disc) + self.log_dict(log_dict_ae) + + # evaluate the psnr and ssim + x = x.clamp(-1, 1) + xrec = xrec.clamp(-1, 1) + x = (x + 1) / 2 + xrec = (xrec + 1) / 2 + psnr = compute_psnr(xrec, x) + ssim = compute_ssim(xrec, x) + + self.log(f"val{postfix}/psnr", psnr, prog_bar=True, logger=True, on_step=True, on_epoch=True) + self.log(f"val{postfix}/ssim", ssim, prog_bar=True, logger=True, on_step=True, on_epoch=True) + return log_dict_ae + + def configure_optimizers(self): + if self.trainable_ae_params is None: + ae_params = self.get_autoencoder_params() + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable autoencoder parameters: {len(ae_params):,}") + else: + ae_params, num_ae_params = self.get_param_groups( + self.trainable_ae_params, self.ae_optimizer_args + ) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable autoencoder parameters: {num_ae_params:,}") + if self.trainable_disc_params is None: + disc_params = self.get_discriminator_params() + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable discriminator parameters: {len(disc_params):,}") + else: + disc_params, num_disc_params = self.get_param_groups( + self.trainable_disc_params, self.disc_optimizer_args + ) + print0( + f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Number of trainable discriminator parameters: {num_disc_params:,}" + ) + opt_ae = self.instantiate_optimizer_from_config( + ae_params, + default(self.lr_g_factor, 1.0) * self.learning_rate, + self.optimizer_config, + ) + + if len(disc_params) > 0: + opt_disc = self.instantiate_optimizer_from_config( + disc_params, self.learning_rate, self.optimizer_config + ) + + lr_freq1 = 1 + lr_freq2 = 1 + if not self.use_scheduler_g: + total_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + scheduler1 = ConstantWarmupScheduler(opt_ae, warmup_steps=500, total_steps=total_steps) + else: + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use generator lr scheduler: {self.lr_scheduler_config_g.target}") + lr_freq1 = self.lr_scheduler_config_g.params.frequency if hasattr(self.lr_scheduler_config_g.params, 'frequency') else 1 + max_decay_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler max_decay_steps: {max_decay_steps}") + if 'inverse_sqrt' in self.lr_scheduler_config_g.target: + scheduler1 = transformers.get_inverse_sqrt_schedule(optimizer=opt_ae, num_warmup_steps=self.lr_scheduler_config_g.params.num_warmup_steps) + elif 'LambdaWarmUpCosineScheduler' in self.lr_scheduler_config_g.target: + scheduler1 = LambdaWarmUpCosineScheduler(optimizer=opt_ae, total_steps=max_decay_steps, **self.lr_scheduler_config_g.params) + elif 'LinearWarmupScheduler' in self.lr_scheduler_config_g.target: + scheduler1 = LinearWarmupScheduler(opt_ae, total_steps=max_decay_steps, **self.lr_scheduler_config_g.params) + else: + scheduler1 = instantiate_lrscheduler_from_config(opt_ae, self.lr_scheduler_config_g, total_steps=max_decay_steps) + + if not self.use_scheduler_d: + total_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + scheduler2 = ConstantWarmupScheduler(opt_disc, warmup_steps=500, total_steps=total_steps) + else: + 
print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler: {self.lr_scheduler_config_d.target}") + lr_freq2 = self.lr_scheduler_config_d.params.frequency if hasattr(self.lr_scheduler_config_d.params, 'frequency') else 1 + max_decay_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use discriminator lr scheduler max_decay_steps: {max_decay_steps}") + if 'inverse_sqrt' in self.lr_scheduler_config_d.target: + scheduler2 = transformers.get_inverse_sqrt_schedule(optimizer=opt_disc, num_warmup_steps=self.lr_scheduler_config_d.params.num_warmup_steps) + elif 'LambdaWarmUpCosineScheduler' in self.lr_scheduler_config_d.target: + scheduler2 = LambdaWarmUpCosineScheduler(optimizer=opt_disc, total_steps=max_decay_steps, **self.lr_scheduler_config_d.params) + elif 'LinearWarmupScheduler' in self.lr_scheduler_config_d.target: + scheduler2 = LinearWarmupScheduler(opt_disc, total_steps=max_decay_steps, **self.lr_scheduler_config_d.params) + else: + scheduler2 = instantiate_lrscheduler_from_config(opt_disc, self.lr_scheduler_config_d, total_steps=max_decay_steps) + + + lr_scheduler_config1 = { + "optimizer": opt_ae, + "lr_scheduler": { + "scheduler": scheduler1, + "name": "lr_generator", + "interval": "step", + "frequency": lr_freq1, + } + } + lr_scheduler_config2 = { + "optimizer": opt_disc, + "lr_scheduler": { + "scheduler": scheduler2, + "name": "lr_discriminator", + "interval": "step", + "frequency": lr_freq2, + } + } + return (lr_scheduler_config1, lr_scheduler_config2) + + @torch.no_grad() + def log_images(self, batch: Dict, **kwargs) -> Dict: # called at ImageLoggerCallback.log_img() + log = dict() + x = self.get_input(batch) + _, xrec, *_ = self(x) + log["inputs"] = x + log["reconstructions"] = xrec + return log + + +class VidAutoEncoderQformer(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + height_qformer_config: Dict, + width_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.hight_qformer = instantiate_from_config(height_qformer_config) + self.width_qformer = instantiate_from_config(width_qformer_config) + + + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = 
lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + # (bhw, f, c) -> (bhw, f',c') + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.height_emb = nn.Sequential( + nn.Linear(height_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(height_qformer_config.params.num_query_tokens, self.patch_nums[1], 1), + nn.ReLU(), + ) + + self.width_emb = nn.Sequential( + nn.Linear(width_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(width_qformer_config.params.num_query_tokens, self.patch_nums[2], 1), + nn.ReLU(), + ) + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.hight_qformer.parameters()) + + list(self.width_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.height_emb.parameters()) + + list(self.width_emb.parameters()) + ) + + return params + + + def decode(self, z, z_content, z_motion_x, z_motion_y) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h', w', c_q) + z_motion_x: shape (b, f, h_q, w', c_q) + z_motion_y: shape (b, f, h', w_q, c_q) + ''' + z_content = rearrange(z_content, 'B F H W C -> (B H W) F C') + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + z_motion_x = rearrange(z_motion_x, 'B F H W C -> (B F W) H C') + vx = rearrange(self.height_emb(z_motion_x), '(B F W) H C -> B C F H W', F=z.size(2), W=z.size(4)) + z_motion_y = rearrange(z_motion_y, 'B F H W C -> (B F H) W C') + vy = rearrange(self.width_emb(z_motion_y), '(B F H) W C -> B C F H W', F=z.size(2), H=z.size(3)) + c_plus_m = vt + vx + vy # shape (b, c', f, h', w') + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) 
-> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + z_content = self.temporal_qformer(rearrange(z, 'B C F H W -> (B H W) F C')) + z_content = rearrange(z_content, '(B H W) F C -> B F H W C', H=z.size(3), W=z.size(4)) # compressed in the temporal dimension + z_motion_x = self.hight_qformer(rearrange(z, 'B C F H W -> (B F W) H C')) + z_motion_x = rearrange(z_motion_x, '(B F W) H C -> B F H W C', F=z.size(2), W=z.size(4)) # compressed in the height dimension + z_motion_y = self.width_qformer(rearrange(z, 'B C F H W -> (B F H) W C')) + z_motion_y = rearrange(z_motion_y, '(B F H) W C -> B F H W C', F=z.size(2), H=z.size(3)) # compressed in the width dimension + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, None + return z, z_content, z_motion_x, z_motion_y + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class VidAutoEncoderQformerCompact(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + space_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + retain_num_frames: bool = True, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + repeat_for_decoder: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.space_qformer = instantiate_from_config(space_qformer_config) + + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + self.repeat_for_decoder = repeat_for_decoder + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + 
self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.down_channel_temp = nn.Linear(self.hidden_dim, self.temporal_down_dim) + self.up_channel_temp = nn.Linear(self.temporal_down_dim, self.hidden_dim) + self.pre_temporal_qformer = nn.Sequential( + nn.Linear(self.temporal_down_dim * self.patch_nums[1] * self.patch_nums[2], self.hidden_dim), + nn.ReLU(), + ) + self.retain_num_frames = retain_num_frames + if not self.retain_num_frames: + self.pre_spatial_qformer = nn.Sequential( + nn.Linear(self.hidden_dim * self.patch_nums[0], 2 * self.hidden_dim), + nn.ReLU(), + nn.Linear(2 * self.hidden_dim, self.hidden_dim), + nn.ReLU(), + ) + if self.repeat_for_decoder: + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + # (bhw, f, c) -> (bhw, f',c') + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.temporal_down_dim * self.patch_nums[1] * self.patch_nums[2]), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + if retain_num_frames: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim * self.patch_nums[0]), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.space_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.spatial_emb.parameters()) + + list(self.pre_temporal_qformer.parameters()) + + list(self.down_channel_temp.parameters()) + ) + if not self.retain_num_frames: + params += list(self.pre_spatial_qformer.parameters()) + if not self.repeat_for_decoder: + params += list(self.up_channel_temp.parameters()) + return params + + def decode(self, z, z_content, z_motion, only_part=None) -> 
torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + if self.repeat_for_decoder: + z_content = repeat(z_content, 'B F C -> B f F C', f=z.size(2)) + vt = rearrange(self.cont_emb(rearrange(z_content, 'B F A d -> (B F) A d')), '(B f) (H W) C -> B C f H W', H=z.size(3), W=z.size(4), f=z.size(2)) + else: + vt = rearrange(self.cont_emb(z_content), 'B F (C H W) -> B C F H W', H=z.size(3), W=z.size(4)) + vt = self.up_channel_temp(vt.transpose(1, -1)).transpose(1, -1) + if self.retain_num_frames: + vs = rearrange(self.spatial_emb(rearrange(z_motion, 'B F X Y -> (B F) X Y')), '(B F) (H W) C -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + else: + vs = rearrange(self.spatial_emb(z_motion), 'B (H W) (F C) -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + + if self.partial_content_motion == 'content': + c_plus_m = vt + elif self.partial_content_motion == 'motion': + c_plus_m = vs + else: + c_plus_m = vt + vs # shape (b, c', f, h', w') + if only_part == 'content': + c_plus_m = vt + elif only_part == 'motion': + c_plus_m = vs + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = self.pre_temporal_qformer(rearrange(self.down_channel_temp(rearrange(z_shuffled, 'B C F H W -> B F H W C')), 'B F H W C -> B F (H W C)')) + else: + pre_qformer = self.pre_temporal_qformer(rearrange(self.down_channel_temp(rearrange(z, 'B C F H W -> B F H W C')), 'B F H W C -> B F (H W C)')) + z_content = self.temporal_qformer(pre_qformer) # shape (b, f_q, d_q) + layer_norm_content = nn.LayerNorm(z_content.size(-1)).to(z_content.device) + z_content = layer_norm_content(z_content) + + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z, 'B C F H W -> B (H W) (F C)'))) + layer_norm_motion = nn.LayerNorm(z_motion.size(-1)).to(z_motion.device) + z_motion = layer_norm_motion(z_motion) + if return_reg_log: + return z, z_content, z_motion, None + return z, z_content, z_motion + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion + + +class VidAutoEncoderQformerCompactSym(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + space_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: 
float = 1.0, + compile_model: bool = False, + retain_num_frames: bool = True, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + expect_ch: int = 4, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + self.space_qformer = instantiate_from_config(space_qformer_config) + + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.retain_num_frames = retain_num_frames + if not self.retain_num_frames: + self.pre_spatial_qformer = nn.Sequential( + nn.Linear(self.hidden_dim * self.patch_nums[0], 2 * self.hidden_dim), + nn.ReLU(), + nn.Linear(2 * self.hidden_dim, self.hidden_dim), + nn.ReLU(), + ) + + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + if retain_num_frames: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + else: + self.spatial_emb = nn.Sequential( + nn.Linear(space_qformer_config.params.query_hidden_size, 
self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim * self.patch_nums[0]), + nn.ReLU(), + nn.Conv1d(space_qformer_config.params.num_query_tokens, self.patch_nums[1] * self.patch_nums[2], 1), + nn.ReLU(), + ) + + + downsample_blocks = [] + in_channels = temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, expect_ch, kernel_size=3, stride=1, padding=1) + self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.space_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.spatial_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + ) + if not self.retain_num_frames: + params += list(self.pre_spatial_qformer.parameters()) + + return params + + def decode(self, z, z_content, z_motion) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + if self.retain_num_frames: + vs = rearrange(self.spatial_emb(rearrange(z_motion, 'B F X Y -> (B F) X Y')), '(B F) (H W) C -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + else: + vs = rearrange(self.spatial_emb(z_motion), 'B (H W) (F C) -> B C F H W', H=z.size(3), W=z.size(4), F=z.size(2)) + + if self.partial_content_motion == 'content': + c_plus_m = vt + elif self.partial_content_motion == 'motion': + c_plus_m = vs + else: + c_plus_m = vt + vs # shape (b, c', 
f, h', w') + + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z, 'B C F H W -> B (H W) (F C)'))) + if return_reg_log: + # return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion, None + return z, z_content, z_motion + + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion + + +class VidAutoEncoderQformerCompactSymDis(VidAutoEncoderQformerCompactSym): + + def __init__( + self, + *args, + shuffle_content_ratio: float = 0.5, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.shuffle_content_ratio = shuffle_content_ratio + + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + # shuffle the content frames + x_shuffled = x.clone() + for i in range(x.size(0)): + randn_num = torch.rand(1) + if randn_num < self.shuffle_content_ratio: + idx = torch.randperm(x.size(2)) + x_shuffled[i] = x[i, :, idx, :, :] + x = torch.cat([x, x_shuffled], dim=0) + z = self.encoder(x) # shape (2b, c', f, h', w') + z_orig, z_shuffled = z.chunk(2, dim=0) + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + if self.retain_num_frames: + z_motion = self.space_qformer(rearrange(z_orig, 'B C F H W -> (B F) (H W) C')) # shape (bf, n_q, d_q) + # for each frame, we use qformer to compress the spatial dimension + z_motion = rearrange(z_motion, '(B F) a b -> B F a b', F=z.size(2)) + else: + z_motion = self.space_qformer(self.pre_spatial_qformer(rearrange(z_orig, 'B C F H W -> B (H W) (F C)'))) + if return_reg_log: + # return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion, None + return z, z_content, z_motion + +class 
VidAutoEncoderQformerCompactSymVid(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + motion_num_blocks: int = 2, + expect_ch: int = 4, + d_dim: int = 16, + # space_qformer_config: Dict, + downsample_motion: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.d_dim = d_dim + + + downsample_blocks = [] + in_channels = 
temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, expect_ch, kernel_size=3, stride=1, padding=1) + self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + self.motion_emb = nn.Sequential( + nn.Linear(self.d_dim, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim), + nn.ReLU() + ) + self.motion_head = nn.Conv2d(self.hidden_dim, self.d_dim, kernel_size=3, stride=1, padding=1) + + self.downsample_motion = downsample_motion + if self.downsample_motion: + motion_downsample_blocks = [] + curr_resol = self.patch_nums[1] + for i in range(motion_num_blocks): + motion_downsample_blocks.append(nn.Conv2d(self.hidden_dim, self.hidden_dim, kernel_size=3, stride=2, padding=1)) + motion_downsample_blocks.append(nn.ReLU()) + curr_resol = (curr_resol + 1) // 2 + self.downsample_motion_module = nn.Sequential(*motion_downsample_blocks) + self.up_motion = nn.Sequential(nn.Linear(curr_resol, self.patch_nums[1]), + nn.ReLU(), + nn.Linear(self.patch_nums[1], self.patch_nums[1]), + nn.ReLU()) + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + list(self.motion_emb.parameters()) + + list(self.motion_head.parameters()) + + ) + if self.downsample_motion: + params += list(self.downsample_motion_module.parameters()) + params += list(self.up_motion.parameters()) + + return params + + def decode(self, z, z_content, z_motion_x, z_motion_y) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = 
rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + vx = rearrange(self.motion_emb(rearrange(z_motion_x, 'B D F W -> B F W D')), 'B F W C -> B C F W') # shape (b, c', f, w') + vy = rearrange(self.motion_emb(rearrange(z_motion_y, 'B D F H -> B F H D')), 'B F H C -> B C F H') # shape (b, c', f, h') + if self.downsample_motion: + vx = self.up_motion(vx) + vy = self.up_motion(vy) + vx = repeat(vx, 'b c f w -> b c f h w', h=z.size(3)) + vy = repeat(vy, 'b c f h -> b c f h w', w=z.size(4)) + + c_plus_m = vt + vx + vy # shape (b, c', f, h', w') + + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): + idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B F H W C', F=z_content.size(1)) + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + z_motion_x, z_motion_y = self.get_motion_latent(z) + + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, None + return z, z_content, z_motion_x, z_motion_y + + def get_motion_latent(self, z: torch.Tensor) -> torch.Tensor: + f = z.size(2) + if self.downsample_motion: + z = self.downsample_motion_module(rearrange(z, 'B C F H W -> (B F) C H W')) + z = rearrange(z, '(B F) C H W -> B C F H W', F=f) + ux = torch.mean(z, dim=-2) # shape (b, c', f, w') + uy = torch.mean(z, dim=-1) # shape (b, c', f, h') + zx = self.motion_head(ux) # shape (b, d, f, w') + zy = self.motion_head(uy) # shape (b, d, f, h') + return zx, zy + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class VidAutoEncoderQformerCompactSymVidVAE(VidAutoEncoderQformerBase): + + def __init__( + self, + *args, + encoder_config: Dict, + decoder_config: Dict, + loss_config: Dict, + regularizer_config: Dict, + temporal_qformer_config: Dict, + lr_scheduler_config_g=None, + lr_scheduler_config_d=None, + trainable_ae_params=None, + ae_optimizer_args = None, + trainable_disc_params = None, + lr_scheduler_config: Dict = None, + weight_decay: float = 1e-5, + disc_optimizer_args = None, + optimizer_config: Union[Dict, None] = None, + lr_g_factor: float = 1.0, + compile_model: bool = False, + temporal_down_dim: int = 32, + partial_content_motion: str = 'all', + shuffle_content: bool = False, + init_ch: int = 128, + cont_num_blocks: int = 2, + motion_num_blocks: int = 2, + expect_ch: int = 4, + d_dim: int = 16, + downsample_motion: bool = False, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ckpt_path2 = kwargs.pop("ckpt_path2", None) + ignore_keys = 
kwargs.pop("ignore_keys", ()) + super().__init__(*args, **kwargs) + compile = ( + torch.compile + if (version.parse(torch.__version__) >= version.parse("2.0.0")) + and compile_model + else lambda x: x + ) + + self.encoder = compile(instantiate_from_config(encoder_config)) + self.decoder = compile(instantiate_from_config(decoder_config)) + self.loss = instantiate_from_config(loss_config) + self.regularization = instantiate_from_config(regularizer_config) + + # define the qformer + self.temporal_qformer = instantiate_from_config(temporal_qformer_config) + + self.partial_content_motion = partial_content_motion + self.shuffle_content = shuffle_content + + self.use_scheduler = lr_scheduler_config is not None + self.check = 0 + self.weight_decay = weight_decay + if self.use_scheduler: + self.lr_scheduler_config = lr_scheduler_config + self.use_scheduler_g = lr_scheduler_config_g is not None + self.use_scheduler_d = lr_scheduler_config_d is not None + if self.use_scheduler_g: + self.lr_scheduler_config_g = lr_scheduler_config_g + if self.use_scheduler_d: + self.lr_scheduler_config_d = lr_scheduler_config_d + self.optimizer_config = default( + optimizer_config, {"target": "torch.optim.Adam", "params": {"betas": (0, 0.99), "weight_decay": self.weight_decay}}) + self.trainable_ae_params = trainable_ae_params + if self.trainable_ae_params is not None: + self.ae_optimizer_args = default( + ae_optimizer_args, + [{} for _ in range(len(self.trainable_ae_params))], + ) + assert len(self.ae_optimizer_args) == len(self.trainable_ae_params) + else: + self.ae_optimizer_args = [{}] # makes type consitent + self.trainable_disc_params = trainable_disc_params + if self.trainable_disc_params is not None: + self.disc_optimizer_args = default( + disc_optimizer_args, + [{} for _ in range(len(self.trainable_disc_params))], + ) + assert len(self.disc_optimizer_args) == len(self.trainable_disc_params) + else: + self.disc_optimizer_args = [{}] # makes type consitent + + self.lr_g_factor = lr_g_factor + + self.hidden_dim = encoder_config.params.hidden_size + self.patch_nums = np.array(list(encoder_config.params.input_size)) // np.array(list(encoder_config.params.patch_size)) + + self.temporal_down_dim = temporal_down_dim + self.cont_emb = nn.Sequential( + nn.Linear(temporal_qformer_config.params.query_hidden_size, self.hidden_dim), + nn.ReLU(), + nn.Conv1d(temporal_qformer_config.params.num_query_tokens, self.patch_nums[0], 1), + nn.ReLU(), + ) + + self.d_dim = d_dim + + + downsample_blocks = [] + in_channels = temporal_qformer_config.params.query_hidden_size + self.init_ch = init_ch + self.conv_in = nn.Conv2d(in_channels, self.init_ch, kernel_size=3, stride=1, padding=1) + in_channels = self.init_ch + + + for i in range(cont_num_blocks): + out_channels = 2 * in_channels + downsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)) + downsample_blocks.append(nn.ReLU()) + in_channels = out_channels + self.content_downsample_blocks = nn.Sequential(*downsample_blocks) + + self.max_channels = in_channels + upsample_blocks = [] + for i in range(cont_num_blocks): + out_channels = in_channels // 2 + upsample_blocks.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)) + upsample_blocks.append(nn.ReLU()) + upsample_blocks.append(nn.Upsample(scale_factor=2)) + in_channels = out_channels + self.content_upsample_blocks = nn.Sequential(*upsample_blocks) + + + self.bottle_down = nn.Conv2d(self.max_channels, 2*expect_ch, kernel_size=3, stride=1, padding=1) + 
self.bottle_up = nn.Sequential( + nn.Conv2d(expect_ch, self.max_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU()) + self.conv_out = nn.Conv2d(self.init_ch, temporal_qformer_config.params.query_hidden_size, kernel_size=3, stride=1, padding=1) + + self.motion_emb = nn.Sequential( + nn.Linear(self.d_dim, self.hidden_dim), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.hidden_dim), + nn.ReLU() + ) + self.motion_head = nn.Conv2d(self.hidden_dim, 2*self.d_dim, kernel_size=3, stride=1, padding=1) + + self.downsample_motion = downsample_motion + if self.downsample_motion: + motion_downsample_blocks = [] + curr_resol = self.patch_nums[1] + for i in range(motion_num_blocks): + motion_downsample_blocks.append(nn.Conv2d(self.hidden_dim, self.hidden_dim, kernel_size=3, stride=2, padding=1)) + motion_downsample_blocks.append(nn.ReLU()) + curr_resol = (curr_resol + 1) // 2 + self.downsample_motion_module = nn.Sequential(*motion_downsample_blocks) + self.up_motion = nn.Sequential(nn.Linear(curr_resol, self.patch_nums[1]), + nn.ReLU(), + nn.Linear(self.patch_nums[1], self.patch_nums[1]), + nn.ReLU()) + + + ckpt_path = get_valid_paths(ckpt_path, ckpt_path2) + print0(f"[bold magenta]\[vidtok.models.vidtwin_ae][VidAutoencoderQformer][/bold magenta] Use ckpt_path: {ckpt_path}") + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def get_disentangle_params(self) -> list: + params = ( + list(self.temporal_qformer.parameters()) + + list(self.cont_emb.parameters()) + + list(self.conv_in.parameters()) + + list(self.content_downsample_blocks.parameters()) + + list(self.bottle_down.parameters()) + + list(self.bottle_up.parameters()) + + list(self.conv_out.parameters()) + + list(self.content_upsample_blocks.parameters()) + + list(self.motion_emb.parameters()) + + list(self.motion_head.parameters()) + + ) + if self.downsample_motion: + params += list(self.downsample_motion_module.parameters()) + params += list(self.up_motion.parameters()) + + return params + + + def decode(self, z, z_content, z_motion_x, z_motion_y, only_part=None) -> torch.Tensor: + ''' + input: z: shape (b, c', f, h', w') + z_content: shape (b, f_q, h_q, w_q, c_q) + z_motion: shape (b, [f] , s_q, c_q) + ''' + + z_content_up = self.conv_out(self.content_upsample_blocks(self.bottle_up(rearrange(z_content, 'B F H W C -> (B F) C H W')))) + _,_,h,w = z_content_up.shape + if h > z.size(3): + border = (h - z.size(3)) // 2 + z_content_up = z_content_up[:, :, border:border+z.size(3), border:border+z.size(4)] + z_content = rearrange(z_content_up, '(B F) C H W -> (B H W) F C', F=z_content.size(1)) + vt = rearrange(self.cont_emb(z_content), '(B H W) F C -> B C F H W', H=z.size(3), W=z.size(4)) + + vx = rearrange(self.motion_emb(rearrange(z_motion_x, 'B D F W -> B F W D')), 'B F W C -> B C F W') # shape (b, c', f, w') + vy = rearrange(self.motion_emb(rearrange(z_motion_y, 'B D F H -> B F H D')), 'B F H C -> B C F H') # shape (b, c', f, h') + if self.downsample_motion: + vx = self.up_motion(vx) + vy = self.up_motion(vy) + vx = repeat(vx, 'b c f w -> b c f h w', h=z.size(3)) + vy = repeat(vy, 'b c f h -> b c f h w', w=z.size(4)) + + if only_part == 'content': + c_plus_m = vt + elif only_part == 'motion': + c_plus_m = vx + vy + else: + c_plus_m = vt + vx + vy + x = self.decoder(c_plus_m) + return x + + def encode(self, x: Any, return_reg_log: bool = False) -> Any: + z = self.encoder(x) # shape (b, c', f, h', w') + if self.shuffle_content: + b, c, f, h, w = z.shape + z_shuffled = torch.empty_like(z) + for i in range(b): 
+ idx = torch.randperm(f) + z_shuffled[i] = z[i, :, idx, :, :] + pre_qformer = rearrange(z_shuffled, 'B C F H W -> (B H W) F C') + else: + pre_qformer = rearrange(z, 'B C F H W -> (B H W) F C') + z_content = self.temporal_qformer(pre_qformer) # shape (bhw, f_q, d_q) + z_content_down = self.bottle_down(self.content_downsample_blocks(self.conv_in(rearrange(z_content, '(B H W) F C -> (B F) C H W', H=z.size(3), W=z.size(4))))) + z_content = rearrange(z_content_down, '(B F) C H W -> B C F H W', F=z_content.size(1)) + z_content, content_reglog = self.regularization(z_content) + z_content = rearrange(z_content, 'B C F H W -> B F H W C') + # intuitively, we can view the z_content as a method to retrieve the content frames (including its nums and dims) + z_motion_x, z_motion_y = self.get_motion_latent(z) + z_motion_x, z_motion_x_log = self.regularization(z_motion_x) + z_motion_y, z_motion_y_log = self.regularization(z_motion_y) + reg_log = {} + reg_log['kl_loss'] = content_reglog['kl_loss'] + z_motion_x_log['kl_loss'] + z_motion_y_log['kl_loss'] + if return_reg_log: + return z, z_content, z_motion_x, z_motion_y, reg_log + return z, z_content, z_motion_x, z_motion_y + + def get_motion_latent(self, z: torch.Tensor) -> torch.Tensor: + f = z.size(2) + if self.downsample_motion: + z = self.downsample_motion_module(rearrange(z, 'B C F H W -> (B F) C H W')) + z = rearrange(z, '(B F) C H W -> B C F H W', F=f) + ux = torch.mean(z, dim=-2) # shape (b, c', f, w') + uy = torch.mean(z, dim=-1) # shape (b, c', f, h') + zx = self.motion_head(ux) # shape (b, d, f, w') + zy = self.motion_head(uy) # shape (b, d, f, h') + + return zx, zy + + def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # x: (bs, 3, 17, h, w) + z, z_content, z_motion_x, z_motion_y, reg_log = self.encode(x, return_reg_log=True) + # z: shape (b, c', f, h', w') + + dec = self.decode(z, z_content, z_motion_x, z_motion_y) + # dec: (bs, 3, 17, h, w) + return z, dec, reg_log, z_content, z_motion_x, z_motion_y + + + +class DepthToSpace(nn.Module): + def __init__(self, block_size): + super().__init__() + self.bs, self.bt = block_size + + def forward(self, x): + B, C, N, H, W = x.size() + x = x.view(B, self.bt, self.bs, self.bs, C // ((self.bs ** 2) * self.bt), N, H, W) # (B, bs, bs, bs, C//bs^2, N, H, W) + x = x.permute(0, 4, 5, 1, 6, 2, 7, 3).contiguous() # (B, C//bs^3, N, bs, H, bs, W, bs) + x = x.view(B, C // ((self.bs ** 2) * self.bt), N * self.bt, H * self.bs, W * self.bs) # (B, C//bs^3, N * bs, H * bs, W * bs) + # remove the first frame + if self.bt > 1: + x = x[:, :, 1:, :, :] + else: + x = x + return x + + +from torch.optim.lr_scheduler import _LRScheduler + + +class LinearWarmupScheduler(_LRScheduler): + def __init__(self, optimizer, warmup_steps, total_steps, target_lr, last_epoch=-1): + self.warmup_steps = warmup_steps + self.target_lr = target_lr + self.total_steps = total_steps + super(LinearWarmupScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + # Linear warm-up + return [base_lr * (self.last_epoch / self.warmup_steps) for base_lr in self.base_lrs] + elif self.last_epoch < self.total_steps: + # Constant learning rate + return [base_lr * (1 - self.last_epoch / self.total_steps) for base_lr in self.base_lrs] + else: + return self.base_lrs + +class ConstantWarmupScheduler(_LRScheduler): + def __init__(self, optimizer, warmup_steps, total_steps, last_epoch=-1): + self.warmup_steps = warmup_steps + self.total_steps = total_steps + # self.base_lrs = 
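# Shape sanity-check for the DepthToSpace module defined above, assuming toy
# block sizes bs=2, bt=2: channel blocks of size (bt, bs, bs) are folded back
# into the temporal and spatial axes (a 3-D pixel shuffle), and the first
# expanded frame is dropped when bt > 1. The einops form below mirrors the
# view / permute / view sequence in that module.
import torch
from einops import rearrange

bs_blk, bt_blk = 2, 2                                         # spatial / temporal block sizes
x = torch.randn(1, 3 * bs_blk * bs_blk * bt_blk, 9, 8, 8)     # (B, C, N, H, W)

y = rearrange(x, 'b (bt s1 s2 c) n h w -> b c (n bt) (h s1) (w s2)',
              bt=bt_blk, s1=bs_blk, s2=bs_blk)
y = y[:, :, 1:]                                               # drop the first expanded frame (bt > 1)
assert y.shape == (1, 3, 9 * bt_blk - 1, 16, 16)              # 17 frames, matching the 17-frame clips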
lr_max + super(ConstantWarmupScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + # Linear warm-up + return [base_lr * (self.last_epoch / self.warmup_steps) for base_lr in self.base_lrs] + elif self.last_epoch < self.total_steps: + # Constant learning rate + return self.base_lrs + +class LambdaWarmUpCosineScheduler(_LRScheduler): + """ + note: use with a base_lr of 1.0 + """ + def __init__(self, optimizer, lr_min, lr_max, lr_start, total_steps, warmup_rate = -1, verbosity_interval=0, last_epoch=-1, warmup_steps=-1): + self.verbosity_interval = verbosity_interval + if warmup_rate >= 0: + self.lr_warm_up_steps = total_steps * warmup_rate + elif warmup_steps >= 0: + self.lr_warm_up_steps = warmup_steps + else: + self.lr_warm_up_steps = 0 + self.lr_start = lr_start + self.lr_min = lr_min + self.lr_max = lr_max + self.lr_max_decay_steps = total_steps + super(LambdaWarmUpCosineScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.verbosity_interval > 0: + if self.last_epoch % self.verbosity_interval == 0: print(f"current step: {self.last_epoch}, recent lr-multiplier: {self.last_lr}") + if self.last_epoch < self.lr_warm_up_steps: + lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * self.last_epoch + self.lr_start + self.last_lr = lr + return [lr] + else: + t = (self.last_epoch - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) + t = min(t, 1.0) + lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( + 1 + np.cos(t * np.pi)) # a + 0.5 * (b - a) * (1 + cos(pi * t)), where t \in [0, 1], so the lr will be in [a, b] + self.last_lr = lr + return [lr] + + + +def instantiate_lrscheduler_from_config(optimizer, config, name='main-LR'): + """ + Instantiate a learning rate scheduler from a config dict. + If use timm, must add the following codes to the LightningModule: + + def lr_scheduler_step(self, scheduler, metric): + if 'timm.scheduler' in self.lr_scheduler_config.target: + scheduler.step(epoch=self.current_epoch) + else: + if metric is None: + scheduler.step() + else: + scheduler.step(metric) + """ + assert 'target' in config, 'Expected key `target` to instantiate.' 
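# Numerical sketch of the warm-up-plus-cosine rule implemented by
# LambdaWarmUpCosineScheduler above: a linear ramp from lr_start to lr_max over
# the warm-up steps, then a cosine decay from lr_max down to lr_min. The
# constants here are illustrative only.
import numpy as np

def warmup_cosine_lr(step, lr_start, lr_max, lr_min, warmup_steps, total_steps):
    if step < warmup_steps:
        return (lr_max - lr_start) / warmup_steps * step + lr_start
    t = min((step - warmup_steps) / (total_steps - warmup_steps), 1.0)
    return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(np.pi * t))

lrs = [warmup_cosine_lr(s, 1e-6, 1e-4, 1e-6, 1000, 10000) for s in range(0, 10001, 2500)]
# ramps up to ~1e-4 by step 1000, then decays back towards 1e-6 at step 10000
print([f"{lr:.2e}" for lr in lrs])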
+ if ('torch.optim' in config.target) or ('timm.scheduler' in config.target): + scheduler = get_obj_from_str(config["target"])(optimizer, **config.get("params", dict())) + lr_scheduler = { + 'scheduler': scheduler, + 'name': name + } + else: + scheduler_init = instantiate_from_config(config) + scheduler = LambdaLR(optimizer, lr_lambda=scheduler_init.schedule) + lr_scheduler = { + 'scheduler': LambdaLR(optimizer, lr_lambda=scheduler_init.schedule), + 'name': name, + 'interval': 'step', + 'frequency': 1 + } + return scheduler + + + diff --git a/Meissonic/vidtok_cache/VidTok/vidtwin/modules/qformer.py b/Meissonic/vidtok_cache/VidTok/vidtwin/modules/qformer.py new file mode 100644 index 0000000000000000000000000000000000000000..be08e95b15c2fa1b82a0d761547734834e6bf139 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtwin/modules/qformer.py @@ -0,0 +1,654 @@ +# coding=utf-8 +"""PyTorch BLIP-2 model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from transformers import Blip2QFormerConfig, Blip2PreTrainedModel + + +logger = logging.get_logger(__name__) + +class Blip2QFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + 
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
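# A single-head sketch of the cross-attention pattern this module runs when
# is_cross_attention=True: a small set of learned query tokens attends over the
# encoder features, compressing an arbitrary number of positions down to
# num_query_tokens (this is how the temporal/spatial Q-Formers squeeze F or H*W
# positions into a fixed-size latent). Dimensions are toy values; no masking,
# dropout or relative position terms.
import math
import torch

class TinyCrossAttention(torch.nn.Module):
    def __init__(self, d_q=64, d_enc=768):
        super().__init__()
        self.q = torch.nn.Linear(d_q, d_q)
        self.k = torch.nn.Linear(d_enc, d_q)
        self.v = torch.nn.Linear(d_enc, d_q)

    def forward(self, queries, enc):                          # (B, Q, d_q), (B, S, d_enc)
        scores = self.q(queries) @ self.k(enc).transpose(-1, -2) / math.sqrt(queries.size(-1))
        return torch.softmax(scores, dim=-1) @ self.v(enc)    # (B, Q, d_q)

attn = TinyCrossAttention()
out = attn(torch.randn(2, 3, 64), torch.randn(2, 16, 768))
assert out.shape == (2, 3, 64)                                # 16 positions compressed to 3 query tokens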
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class Blip2QFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention) + self.output = Blip2QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class Blip2QFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class Blip2QFormerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = 
self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Blip2QFormerModel(Blip2PreTrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. + """ + + def __init__(self, config: Blip2QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + query_embeds: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +from einops import repeat + +class MyQformerInterface(nn.Module): + def __init__(self, num_query_tokens=3, query_hidden_size=64, encoder_hidden_size=768, num_hidden_layers=6,intermediate_size=768, num_attention_heads=8): + super().__init__() + self.config = Blip2QFormerConfig(hidden_size=query_hidden_size, encoder_hidden_size=encoder_hidden_size, num_hidden_layers=num_hidden_layers, intermediate_size=intermediate_size, num_attention_heads=num_attention_heads) + self.qformer = Blip2QFormerModel(self.config) + self.query_embeds = nn.Parameter(torch.randn(num_query_tokens, query_hidden_size)) + + def forward(self, encoder_hidden_states): + query_batch = repeat(self.query_embeds, 'q d -> b q d', 
b=encoder_hidden_states.shape[0]) + output = self.qformer(query_embeds=query_batch, encoder_hidden_states=encoder_hidden_states) + return output.last_hidden_state + + +if __name__ == '__main__': + a_former = MyQformerInterface(10, 768, 768) + print('initialized query embeddings', a_former.query_embeds) + test_encoder_hidden_states = torch.randn(2, 16, 768) * 100 + + for name, param in a_former.named_parameters(): + print(name, param.shape) + optim = torch.optim.Adam(a_former.parameters(), lr=0.01) + for i in range(20): + print('running forward pass', i) + output = a_former(test_encoder_hidden_states) + print('loss', output.sum()) + output.sum().backward() + optim.step() + optim.zero_grad() + + print('query embeddings after 10 forward passes', a_former.query_embeds) + diff --git a/Meissonic/vidtok_cache/VidTok/vidtwin/modules/st_transformer.py b/Meissonic/vidtok_cache/VidTok/vidtwin/modules/st_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ae93682b034377a6fa7c36a55f6e714462bab28b --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtwin/modules/st_transformer.py @@ -0,0 +1,804 @@ +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +from timm.models.layers import DropPath +from timm.models.vision_transformer import Mlp +import torch.nn.functional as F +approx_gelu = lambda: nn.GELU(approximate="tanh") + +from collections.abc import Iterable + +from torch.utils.checkpoint import checkpoint, checkpoint_sequential +from pathlib import Path +from omegaconf import ListConfig +from torch.cuda.amp import autocast + +from einops import rearrange, repeat, reduce, pack, unpack +import pickle + +def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1): + assert isinstance(model, nn.Module) + + def set_attr(module): + module.grad_checkpointing = True + module.fp32_attention = use_fp32_attention + module.grad_checkpointing_step = gc_step + + model.apply(set_attr) + + +def auto_grad_checkpoint(module, *args, **kwargs): + if getattr(module, "grad_checkpointing", False): + if not isinstance(module, Iterable): + return checkpoint(module, *args, **kwargs) + gc_step = module[0].grad_checkpointing_step + return checkpoint_sequential(module, gc_step, *args, **kwargs) + return module(*args, **kwargs) + + +def get_layernorm(hidden_size: torch.Tensor, eps: float, affine: bool, use_kernel: bool): + if use_kernel: + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(hidden_size, elementwise_affine=affine, eps=eps) + except ImportError: + raise RuntimeError("FusedLayerNorm not available. Please install apex.") + else: + return nn.LayerNorm(hidden_size, eps, elementwise_affine=affine) + + +def t2i_modulate(x, shift, scale): + return x * (1 + scale) + shift + + +class T2IFinalLayer(nn.Module): + """ + The final layer of PixArt. 
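# How the t2i_modulate helper defined above and a scale_shift_table parameter
# (as used by T2IFinalLayer and STBlock in this file) are typically applied: a
# learned per-channel (shift, scale) modulates LayerNorm output, PixArt
# "adaLN-single" style. Sizes here are toy values.
import torch
import torch.nn as nn

hidden = 8
scale_shift_table = nn.Parameter(torch.randn(2, hidden) / hidden ** 0.5)
norm = nn.LayerNorm(hidden, elementwise_affine=False, eps=1e-6)

x = torch.randn(4, 10, hidden)                                # (B, N, C) tokens
shift, scale = scale_shift_table[None].chunk(2, dim=1)        # each (1, 1, hidden)
x_mod = norm(x) * (1 + scale) + shift                         # == t2i_modulate(norm(x), shift, scale)
assert x_mod.shape == x.shape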
+ """ + + def __init__(self, hidden_size, num_patch, out_channels): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True) + self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5) + self.out_channels = out_channels + + def forward(self, x): + shift, scale = (self.scale_shift_table[None]).chunk(2, dim=1) + x = t2i_modulate(self.norm_final(x), shift, scale) + x = self.linear(x) + return x + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + enable_flashattn: bool = False, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + self.enable_flashattn = enable_flashattn + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor, causal: bool) -> torch.Tensor: + B, N, C = x.shape + qkv = self.qkv(x) + qkv_shape = (B, N, 3, self.num_heads, self.head_dim) + if self.enable_flashattn: + qkv_permute_shape = (2, 0, 1, 3, 4) + else: + qkv_permute_shape = (2, 0, 3, 1, 4) + qkv = qkv.view(qkv_shape).permute(qkv_permute_shape) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + if self.enable_flashattn: + from flash_attn import flash_attn_func + + x = flash_attn_func( + q, + k, + v, + dropout_p=self.attn_drop.p if self.training else 0.0, + softmax_scale=self.scale, + causal=causal, + ) + else: + # raise NotImplementedError + dtype = q.dtype + q = q * self.scale + attn = q @ k.transpose(-2, -1) # translate attn to float32 + attn = attn.to(torch.float32) + attn = attn.softmax(dim=-1) + attn = attn.to(dtype) # cast back attn to original dtype + attn = self.attn_drop(attn) + x = attn @ v + + x_output_shape = (B, N, C) + if not self.enable_flashattn: + x = x.transpose(1, 2) + x = x.reshape(x_output_shape) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class GroupAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + enable_flashattn: bool = False, + group_size: int = 4, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + self.enable_flashattn = enable_flashattn + self.group_size = group_size + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor, causal: bool) -> torch.Tensor: + B, N, C = x.shape + assert N % self.group_size == 0, "sequence length should be divisible by 
group_size" + G = N // self.group_size + if self.enable_flashattn: + qkv_permute_shape = (2, 0, 1, 3, 4) + else: + qkv_permute_shape = (2, 0, 3, 1, 4) + qkv = self.qkv(x).view(B, N, 3, self.num_heads, self.head_dim).permute(qkv_permute_shape) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + + + if self.enable_flashattn: + # reshape to (B, G, 4, H, D) + q = q.view(B * G, self.group_size, self.num_heads, self.head_dim) + k = k.view(B * G, self.group_size, self.num_heads, self.head_dim) + v = v.view(B * G, self.group_size, self.num_heads, self.head_dim) + from flash_attn import flash_attn_func + + # modify flash_attn_func to support the new shape + x = flash_attn_func( + q, + k, + v, + dropout_p=self.attn_drop.p if self.training else 0.0, + softmax_scale=self.scale, + causal=causal, + ).reshape(B, N, C) + else: + q = rearrange(q, "B H S D -> (B G) H N D", G=G) + k = rearrange(k, "B H S D -> (B G) H N D", G=G) + v = rearrange(v, "B H S D -> (B G) H N D", G=G) + q = q * self.scale + attn = (q @ k.transpose(-2, -1)).softmax(dim=-1) + attn = self.attn_drop(attn) + x = (attn @ v) + x = rearrange(x, "(B G) H N D -> B S (H D)", G=G, S=N) + + x = self.proj(x) + x = self.proj_drop(x) + return x + +class PatchEmbed3D(nn.Module): + """Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__( + self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None, + flatten=True, + ): + super().__init__() + self.patch_size = patch_size + self.flatten = flatten + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # (B 768, 16, 14, 14) patchify, for each patch, we use 768 vector to represent it + if self.norm is not None: + D, Wh, Ww = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCTHW -> BNC + return x + + + +class STBlock(nn.Module): + def __init__( + self, + hidden_size, + num_heads, + d_s=None, + d_t=None, + mlp_ratio=4.0, + drop_path=0.0, + enable_flashattn=True, + enable_layernorm_kernel=False, + temporal_casual=True, + no_temporal=False, + temporal_group = False, + group_size = 1 + # enable_sequence_parallelism=False, + ): + super().__init__() + self.hidden_size = hidden_size + self.enable_flashattn = enable_flashattn + + self.attn_cls = Attention + self.no_temporal = no_temporal + self.attn_group = GroupAttention + self.temporal_group = temporal_group + self.group_size = group_size + + self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) + self.attn = self.attn_cls( + hidden_size, + num_heads=num_heads, + 
qkv_bias=True, + enable_flashattn=enable_flashattn, + ) + self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) + self.mlp = Mlp( + in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0 + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5) + + # temporal attention + self.d_s = d_s + self.d_t = d_t + if self.temporal_group: + self.attn_temp = self.attn_group( + hidden_size, + num_heads=num_heads, + qkv_bias=True, + enable_flashattn=self.enable_flashattn, + group_size=self.group_size, + ) + else: + self.attn_temp = self.attn_cls( + hidden_size, + num_heads=num_heads, + qkv_bias=True, + enable_flashattn=self.enable_flashattn, + ) + self.temporal_casual = temporal_casual + + def forward(self, x, tpe=None): + + # B, T, S, C = x.shape[0] + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + ).chunk(6, dim=1) + x = x.to(torch.float64) + x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa).to(torch.float64) + + # spatial branch + x_s = rearrange(x_m, "B T S C -> (B T) S C", T=self.d_t, S=self.d_s) + # print(x_s.dtype) + # x_s = x_s.to(torch.float32) + x_s = x_s.to(torch.bfloat16) + x_s = self.attn(x_s, causal=False,).to(torch.bfloat16) + x_s = rearrange(x_s, "(B T) S C -> B T S C", T=self.d_t, S=self.d_s) + x = x + self.drop_path(gate_msa * x_s) + + if not self.no_temporal: + # temporal branch + x_t = rearrange(x, "B T S C -> (B S) T C", T=self.d_t, S=self.d_s) + + if tpe is not None: + x_t = x_t + tpe + x_t = x_t.to(torch.bfloat16) + x_t = self.attn_temp(x_t, causal=self.temporal_casual,) + x_t = rearrange(x_t, "(B S) T C -> B T S C", T=self.d_t, S=self.d_s).to(torch.bfloat16) + x = x + self.drop_path(gate_msa * x_t) + + # mlp + x = x.to(torch.float32) + x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) + x = x.to(torch.float32) + + return x + + +def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0): + pos = np.arange(0, length)[..., None] / scale + return get_1d_sincos_pos_embed_from_grid(embed_dim, pos) + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if not isinstance(grid_size, tuple): + grid_size = (grid_size, grid_size) + + grid_h = np.arange(grid_size[0], dtype=np.float32) / scale + grid_w = np.arange(grid_size[1], dtype=np.float32) / scale + if base_size is not None: + grid_h *= base_size / grid_size[0] + grid_w *= base_size / grid_size[1] + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size[1], 
grid_size[0]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def exists(v): + return v is not None + +def default(v, d): + return v if exists(v) else d + +def divisible_by(num, den): + return (num % den) == 0 + +def is_odd(n): + return not divisible_by(n, 2) + +def cast_tuple(t, length = 1): + if isinstance(t, ListConfig): + return tuple(t) + return t if isinstance(t, tuple) else ((t,) * length) + +class DepthToSpace(nn.Module): + + def __init__(self, block_size): + super().__init__() + self.bs, self.bt = block_size + + def forward(self, x): + B, C, N, H, W = x.size() + x = x.view(B, self.bt, self.bs, self.bs, C // ((self.bs ** 2) * self.bt), N, H, W) # (B, bs, bs, bs, C//bs^2, N, H, W) + x = x.permute(0, 4, 5, 1, 6, 2, 7, 3).contiguous() # (B, C//bs^3, N, bs, H, bs, W, bs) + x = x.view(B, C // ((self.bs ** 2) * self.bt), N * self.bt, H * self.bs, W * self.bs) # (B, C//bs^3, N * bs, H * bs, W * bs) + # remove the first frame + if self.bt > 1: + x = x[:, :, 1:, :, :] + else: + x = x + return x + + +# Swish Function +class Swish(nn.Module): + def forward(self, x): + return x * torch.sigmoid(x) + + + +class STTransformer(nn.Module): + def __init__( + self, + input_size=(1, 32, 32), + in_channels=4, + patch_size=(1, 2, 2), + hidden_size=1152, + depth=28, + num_heads=16, + mlp_ratio=4.0, + pred_sigma=False, + drop_path=0.0, + no_temporal_pos_emb=False, + space_scale=1.0, + time_scale=1.0, + freeze=None, + enable_flashattn=False, + enable_layernorm_kernel=False, + temporal_casual=True, + no_temporal=False, + temporal_group=False, + group_size=1, + ): + super().__init__() + self.pred_sigma = pred_sigma + self.in_channels = in_channels + self.out_channels = in_channels * 2 if pred_sigma else in_channels + self.hidden_size = hidden_size + self.patch_size = patch_size + self.input_size = input_size + num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)]) + self.num_patches = num_patches + self.num_temporal = input_size[0] // patch_size[0] + self.num_spatial = num_patches // self.num_temporal + self.num_heads = num_heads + self.no_temporal_pos_emb = no_temporal_pos_emb + self.depth = depth + self.mlp_ratio = mlp_ratio + self.enable_flashattn = enable_flashattn + self.enable_layernorm_kernel = enable_layernorm_kernel + self.space_scale = space_scale + self.time_scale = time_scale + self.temporal_casual = temporal_casual + self.temporal_group = temporal_group + self.group_size = group_size + + self.register_buffer("pos_embed", self.get_spatial_pos_embed()) + self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed()) + + self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size) + self.no_temporal = no_temporal + + drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] + self.blocks = nn.ModuleList( + [ + STBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=self.mlp_ratio, + drop_path=drop_path[i], + enable_flashattn=self.enable_flashattn, + enable_layernorm_kernel=self.enable_layernorm_kernel, + 
d_t=self.num_temporal, + d_s=self.num_spatial, + temporal_casual=self.temporal_casual, + no_temporal=self.no_temporal, + temporal_group = self.temporal_group, + group_size = self.group_size + ) + for i in range(self.depth) + ] + ) + self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels) + + # init model + self.initialize_weights() + self.initialize_temporal() + if freeze is not None: + assert freeze in ["not_temporal", "text"] + if freeze == "not_temporal": + self.freeze_not_temporal() + elif freeze == "text": + self.freeze_text() + + + + def forward(self, x): + """ + Forward pass of STDiT. + Args: + x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W] + + Returns: + x (torch.Tensor): output latent representation; of shape [B, C, T, H, W] + """ + + x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial) + x = x + self.pos_embed + + with autocast(enabled=True): + for i, block in enumerate(self.blocks): + if i == 0: + tpe = self.pos_embed_temporal + else: + tpe = None + x = auto_grad_checkpoint(block, x, tpe) + + x = rearrange(x, "B T S C -> B (T S) C", T=self.num_temporal, S=self.num_spatial) + return x + + def unpatchify(self, x): + """ + Args: + x (torch.Tensor): of shape [B, N, C] + + Return: + x (torch.Tensor): of shape [B, C_out, T, H, W] + """ + + N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)] + T_p, H_p, W_p = self.patch_size + x = rearrange( + x, + "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)", + N_t=N_t, + N_h=N_h, + N_w=N_w, + T_p=T_p, + H_p=H_p, + W_p=W_p, + C_out=self.out_channels, + ) + return x + + def unpatchify_old(self, x): + c = self.out_channels + t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)] + pt, ph, pw = self.patch_size + + x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c)) + x = rearrange(x, "n t h w r p q c -> n c t r h p w q") + imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw)) + return imgs + + def get_spatial_pos_embed(self, grid_size=None): + if grid_size is None: + grid_size = self.input_size[1:] + pos_embed = get_2d_sincos_pos_embed( + self.hidden_size, + (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]), + scale=self.space_scale, + ) + pos_embed = torch.from_numpy(pos_embed).unsqueeze(0).requires_grad_(False) + return pos_embed + + def get_temporal_pos_embed(self): + pos_embed = get_1d_sincos_pos_embed( + self.hidden_size, + self.input_size[0] // self.patch_size[0], + scale=self.time_scale, + ) + pos_embed = torch.from_numpy(pos_embed).unsqueeze(0).requires_grad_(False) + return pos_embed + + def freeze_not_temporal(self): + for n, p in self.named_parameters(): + if "attn_temp" not in n: + p.requires_grad = False + + def freeze_text(self): + for n, p in self.named_parameters(): + if "cross_attn" in n: + p.requires_grad = False + + def initialize_temporal(self): + for block in self.blocks: + nn.init.constant_(block.attn_temp.proj.weight, 0) + nn.init.constant_(block.attn_temp.proj.bias, 0) + + def initialize_weights(self): + # Initialize transformer layers: + def _basic_init(module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + self.apply(_basic_init) + + w = self.x_embedder.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + nn.init.constant_(self.final_layer.linear.weight, 0) + 
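+        # the matching bias is zeroed on the next line, so the final projection initially outputs zeros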
nn.init.constant_(self.final_layer.linear.bias, 0) + +class STTEncoder(STTransformer): + def __init__(self, input_size=(1, 32, 32), in_channels=3, patch_size=(1, 2, 2), hidden_size=64, depth=12, num_heads=8, mlp_ratio=4, pred_sigma=False, drop_path=0, no_temporal_pos_emb=False, space_scale=1, time_scale=1, freeze=None, enable_flashattn=True, enable_layernorm_kernel=False, temporal_casual=True, no_temporal=False, temporal_group=False, group_size=1): + super().__init__(input_size, in_channels, patch_size, hidden_size, depth, num_heads, mlp_ratio, pred_sigma, drop_path, no_temporal_pos_emb, space_scale, time_scale, freeze, enable_flashattn, enable_layernorm_kernel, temporal_casual, no_temporal, temporal_group, group_size) + + def forward(self, x): + x = self.x_embedder(x) + y = super().forward(x) + y = rearrange(y, "B (T H W) C -> B C T H W", T=self.input_size[0], H=self.input_size[1]//self.patch_size[1], W=self.input_size[2]//self.patch_size[2]) + return y + + @property + def device(self): + return self.zero.device + + @classmethod + def init_and_load_from(cls, path, strict = True): + path = Path(path) + assert path.exists() + pkg = torch.load(str(path), map_location = 'cpu') + + assert 'config' in pkg, 'model configs were not found in this saved checkpoint' + + config = pickle.loads(pkg['config']) + tokenizer = cls(**config) + tokenizer.load(path, strict = strict) + return tokenizer + + def save(self, path, overwrite = True): + path = Path(path) + assert overwrite or not path.exists(), f'{str(path)} already exists' + + pkg = dict( + model_state_dict = self.state_dict(), + version =self.__version__, + config = self._configs + ) + + torch.save(pkg, str(path)) + + def load(self, path, strict = True): + path = Path(path) + assert path.exists() + + pkg = torch.load(str(path)) + state_dict = pkg.get('model_state_dict') + version = pkg.get('version') + + assert exists(state_dict) + + if exists(version): + print(f'loading checkpointed tokenizer from version {version}') + + self.load_state_dict(state_dict, strict = strict) + + + @torch.no_grad() + def tokenize(self, video): + self.eval() + return self.forward(video, return_codes = True) + + def debug_model(self, x, layer): + if torch.isnan(x).any(): + print('x has nan') + print(layer) + import sys + sys.exit() + + + +class STTDecoder(STTransformer): + def __init__(self, input_size=(1, 32, 32), in_channels=3, patch_size=(1, 2, 2), hidden_size=1152, depth=12, num_heads=16, mlp_ratio=4, pred_sigma=False, drop_path=0, no_temporal_pos_emb=False, space_scale=1, time_scale=1, freeze=None, enable_flashattn=True, enable_layernorm_kernel=False, temporal_casual=True, no_temporal=False): + super().__init__(input_size, in_channels, patch_size, hidden_size, depth, num_heads, mlp_ratio,pred_sigma, drop_path, no_temporal_pos_emb, space_scale, time_scale, freeze, enable_flashattn, enable_layernorm_kernel, temporal_casual, no_temporal) + self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels) + + def forward(self, x): + x = rearrange(x, "B C T H W -> B (T H W) C") + y = super().forward(x) + y = self.final_layer(y) + y = self.unpatchify(y) + return y + + @property + def device(self): + return self.zero.device + + @classmethod + def init_and_load_from(cls, path, strict = True): + path = Path(path) + assert path.exists() + pkg = torch.load(str(path), map_location = 'cpu') + + assert 'config' in pkg, 'model configs were not found in this saved checkpoint' + + config = pickle.loads(pkg['config']) + tokenizer = cls(**config) + 
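+        # 'config' holds the pickled constructor kwargs written by save(); load() below restores the weights from the same package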
tokenizer.load(path, strict = strict) + return tokenizer + + def save(self, path, overwrite = True): + path = Path(path) + assert overwrite or not path.exists(), f'{str(path)} already exists' + + pkg = dict( + model_state_dict = self.state_dict(), + version = self.__version__, + config = self._configs + ) + + torch.save(pkg, str(path)) + + def load(self, path, strict = True): + path = Path(path) + assert path.exists() + + pkg = torch.load(str(path)) + state_dict = pkg.get('model_state_dict') + version = pkg.get('version') + + assert exists(state_dict) + + if exists(version): + print(f'loading checkpointed tokenizer from version {version}') + + self.load_state_dict(state_dict, strict = strict) + + + @torch.no_grad() + def tokenize(self, video): + self.eval() + return self.forward(video, return_codes = True) + + def debug_model(self, x, layer): + if torch.isnan(x).any(): + print('x has nan') + print(layer) + import sys + sys.exit() + + def get_last_layer(self): + return self.final_layer.linear.weight \ No newline at end of file diff --git a/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_evaluate.py b/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4a7b33dffb2a51c36294124a1ea7079ac70c97 --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_evaluate.py @@ -0,0 +1,208 @@ +import argparse +import os +import sys +sys.path.append(os.getcwd()) + +import warnings +warnings.filterwarnings("ignore") + +import time +import numpy as np +import torch +from contextlib import nullcontext +from pathlib import Path + +import decord +from einops import rearrange +from lightning.pytorch import seed_everything +from omegaconf import OmegaConf +from safetensors.torch import load_file as load_safetensors +from torch import autocast +from torchvision import transforms +from tqdm import tqdm + +from vidtok.modules.lpips import LPIPS +from vidtok.data.vidtok import VidTokValDataset +from vidtok.modules.util import instantiate_from_config, print0, compute_psnr, compute_ssim + + +def load_model_from_config(config, ckpt, verbose=False): + config = OmegaConf.load(config) + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Loading model from {ckpt}") + model = instantiate_from_config(config.model) + + if ckpt.endswith("ckpt"): + sd = torch.load(ckpt, map_location="cpu")["state_dict"] + elif ckpt.endswith("safetensors"): + sd = load_safetensors(ckpt) + else: + raise NotImplementedError(f"Unknown checkpoint: {ckpt}") + + new_sd = {} + for k, v in sd.items(): + if k.startswith("loss"): + continue + new_sd[k] = v + missing, unexpected = model.load_state_dict(new_sd, strict=False) + print0( + f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Restored from {ckpt} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + + if len(missing) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Unexpected Keys: {unexpected}") + return model + + +class MultiVideoDataset(VidTokValDataset): + def __init__( + self, + data_dir, + meta_path=None, + input_height=256, + input_width=256, + num_frames_per_batch=17, + sample_fps=30, + ): + super().__init__( + data_dir=data_dir, + meta_path=meta_path, + video_params={ + "input_height": input_height, + "input_width": input_width, + "sample_num_frames": num_frames_per_batch, + "sample_fps": 
sample_fps, + }, + pre_load_frames=True, + last_frames_handle="repeat", + ) + + def __getitem__(self, idx): + frames = super().__getitem__(idx)["jpg"] + return frames + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--data_dir", + type=str, + default="", + help="root folder", + ) + parser.add_argument( + "--meta_path", + type=str, + default=None, + help="path to the .csv meta file", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=17, + help="number of frames per batch", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset = MultiVideoDataset( + data_dir=args.data_dir, + meta_path=args.meta_path, + input_height=args.input_height, + input_width=args.input_width, + num_frames_per_batch=args.num_frames_per_batch, + sample_fps=args.sample_fps + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + perceptual_loss = LPIPS().eval() + perceptual_loss = perceptual_loss.to(device) + + psnrs, ssims, lpipss = [], [], [] + + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, output, *_ = model(input) + + output = output.clamp(-1, 1) + input, output = map(lambda x: (x + 1) / 2, (input, output)) + + if input.dim() == 5: + input = rearrange(input, "b c t h w -> (b t) c h w") + assert output.dim() == 5 + output = rearrange(output, "b c t h w -> (b t) c h w") + + psnr = compute_psnr(input, output) + ssim = compute_ssim(input, output) + lpips = perceptual_loss(input * 2 - 1, output * 2 - 1).mean() + + psnrs.append(psnr.item()) + ssims.append(ssim.item()) + lpipss.append(lpips.item()) + + toc = time.time() + print0( + f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] PSNR: {np.mean(psnrs):.4f}, SSIM: {np.mean(ssims):.4f}, LPIPS: {np.mean(lpipss):.4f}" + ) + print0(f"[bold red]\[vidtwin.scripts.inference_evaluate][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_reconstruct.py b/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_reconstruct.py new file mode 100644 index 
0000000000000000000000000000000000000000..e568df36bab66e6e0061aeaee05b2da000afc0ec --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_reconstruct.py @@ -0,0 +1,191 @@ +import os +import sys +sys.path.append(os.getcwd()) + +import argparse +import warnings +warnings.filterwarnings("ignore") + +import time +import numpy as np +from contextlib import nullcontext +from pathlib import Path + +import torch +from einops import rearrange +from lightning.pytorch import seed_everything +from torch import autocast +from torchvision.io import write_video +from tqdm import tqdm + +from vidtwin.scripts.inference_evaluate import print0, load_model_from_config, transforms, decord + + +class SingleVideoDataset(torch.utils.data.Dataset): + def __init__(self, video_path, input_height=128, input_width=128, num_frames_per_batch=16, sample_fps=8): + decord.bridge.set_bridge("torch") + self.video_path = video_path + normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + self.transform = transforms.Compose( + [ + transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + normalize, + ] + ) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x : x + num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x : x + num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255.0 + frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/vidtok_kl_causal_488_4chn.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="checkpoints/vidtok_kl_causal_488_4chn.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path", + type=str, + default="assets/example.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=17, + 
help="number of frames per batch", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=30, + help="sample fps", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwininference_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + os.makedirs(args.output_video_dir, exist_ok=True) + + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset = SingleVideoDataset( + args.input_video_path, args.input_height, args.input_width, args.num_frames_per_batch, args.sample_fps + ) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False) + + inputs = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input in tqdm(enumerate(dataloader)): + input = input.to(device) + _, xrec, *_ = model(input) + input = rearrange(input, "b c t h w -> (b t) c h w") + inputs.append(input) + xrec = rearrange(xrec, "b c t h w -> (b t) c h w") + outputs.append(xrec) + + toc = time.time() + + # save the outputs as videos + inputs = tensor_to_uint8(torch.cat(inputs, dim=0)) + inputs = rearrange(inputs, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs.shape[0], outputs.shape[0]) + final = np.concatenate([inputs[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"{Path(args.input_video_path).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + + print0(f"[bold red]Results saved in: {output_video_path}[/bold red]") + print0(f"[bold red]\[vidtwin.scripts.inference_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py b/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py new file mode 100644 index 0000000000000000000000000000000000000000..69a88da7061da0f669fb62c64f0a7cea207700ce --- /dev/null +++ b/Meissonic/vidtok_cache/VidTok/vidtwin/scripts/inference_vidtwin_cross_reconstruct.py @@ -0,0 +1,264 @@ +import argparse +import datetime +import glob +import inspect +import os +import re +import sys +import numpy as np +import warnings +warnings.filterwarnings("ignore") +from inspect import Parameter +from typing import Union +from matplotlib import pyplot as plt +from natsort import natsorted +from omegaconf import OmegaConf +from packaging import version +from PIL import Image +from pathlib import Path +from tqdm import tqdm + +import torch +import torchvision +import wandb + +import lightning.pytorch as pl +from lightning.pytorch import seed_everything +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.callbacks import Callback +from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +import decord +import time +from einops import rearrange +from contextlib import nullcontext +from torch import autocast +from torchvision import transforms +from torchvision.utils import save_image +from torchvision.io import write_video 
+from safetensors.torch import load_file as load_safetensors + +from vidtok.modules.util import instantiate_from_config, print0 + + +def load_model_from_config(config, ckpt, verbose=False): + config = OmegaConf.load(config) + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Loading model from {ckpt}") + model = instantiate_from_config(config.model) + + if ckpt.endswith("ckpt"): + sd = torch.load(ckpt, map_location="cpu")["state_dict"] + elif ckpt.endswith("safetensors"): + sd = load_safetensors(ckpt) + else: + raise NotImplementedError(f"Unknown checkpoint: {ckpt}") + + missing, unexpected = model.load_state_dict(sd, strict=False) + print0( + f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Restored from {ckpt} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if len(missing) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Missing Keys: {missing}") + if len(unexpected) > 0: + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Unexpected Keys: {unexpected}") + return model + + +class VideoDataset(torch.utils.data.Dataset): + def __init__(self, video_path, input_height=128, input_width=128, sample_fps=8, num_frames_per_batch=16): + decord.bridge.set_bridge("torch") + self.video_path = video_path + normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + self.transform = transforms.Compose([transforms.Resize(input_height, antialias=True), + transforms.CenterCrop((input_height, input_width)), + normalize,]) + + self.video_reader = decord.VideoReader(video_path, num_threads=0) + total_frames = len(self.video_reader) + fps = self.video_reader.get_avg_fps() # float + + interval = round(fps / sample_fps) + frame_ids = list(range(0, total_frames, interval)) + self.frame_ids_batch = [] + for x in range(0, len(frame_ids), num_frames_per_batch): + if len(frame_ids[x:x+num_frames_per_batch]) == num_frames_per_batch: + self.frame_ids_batch.append(frame_ids[x:x+num_frames_per_batch]) + + def __len__(self): + return len(self.frame_ids_batch) + + def __getitem__(self, idx): + frame_ids = self.frame_ids_batch[idx] + frames = self.video_reader.get_batch(frame_ids).permute(0, 3, 1, 2).float() / 255. 
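+        # resize, center-crop and normalize to [-1, 1], then reorder (T, C, H, W) -> (C, T, H, W)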
+ frames = self.transform(frames).permute(1, 0, 2, 3) + return frames + + +def tensor_to_uint8(tensor): + tensor = torch.clamp(tensor, -1.0, 1.0) + tensor = (tensor + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w + tensor = (tensor.cpu().numpy() * 255).astype(np.uint8) + return tensor + + +def main(): + def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seed", + type=int, + default=42, + help="the seed (for reproducible sampling)", + ) + parser.add_argument( + "--precision", + type=str, + help="evaluate at this precision", + choices=["full", "autocast"], + default="full" + ) + parser.add_argument( + "--config", + type=str, + default="configs/tvae3d/webvid_kl_f_16_128_884_8chn_80G4.yaml", + help="path to config which constructs model", + ) + parser.add_argument( + "--ckpt", + type=str, + default="xxxxx.ckpt", + help="path to checkpoint of model", + ) + parser.add_argument( + "--output_video_dir", + type=str, + default="tmp", + help="path to save the outputs", + ) + parser.add_argument( + "--input_video_path_structure", + type=str, + default="logs/assets/Nik.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_video_path_dynamics", + type=str, + default="logs/assets/Nik.mp4", + help="path to the input video", + ) + parser.add_argument( + "--input_height", + type=int, + default=256, + help="height of the input video", + ) + parser.add_argument( + "--input_width", + type=int, + default=256, + help="width of the input video", + ) + parser.add_argument( + "--sample_fps", + type=int, + default=4, + help="", + ) + parser.add_argument( + "--num_frames_per_batch", + type=int, + default=16, + help="", + ) + parser.add_argument( + "--concate_input", + type=str2bool, + const=True, + default=True, + nargs="?", + help="", + ) + parser.add_argument( + "--dynamics_split", + type=str2bool, + default=True, + nargs="?", + help="", + ) + + args = parser.parse_args() + seed_everything(args.seed) + + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Evaluating model {args.ckpt}") + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + precision_scope = autocast if args.precision == "autocast" else nullcontext + + os.makedirs(args.output_video_dir, exist_ok=True) + print(args.ckpt) + print(args.config) + model = load_model_from_config(args.config, args.ckpt) + model.to(device).eval() + + dataset_structure = VideoDataset(args.input_video_path_structure, args.input_height, args.input_width, args.sample_fps, args.num_frames_per_batch) + dataset_dynamics = VideoDataset(args.input_video_path_dynamics, args.input_height, args.input_width, args.sample_fps, args.num_frames_per_batch) + min_len = min(len(dataset_structure), len(dataset_dynamics)) + dataset_structure = torch.utils.data.Subset(dataset_structure, range(min_len)) + dataset_dynamics = torch.utils.data.Subset(dataset_dynamics, range(min_len)) + dataloader_structure = torch.utils.data.DataLoader(dataset_structure, batch_size=1, shuffle=False) + dataloader_dynamics = torch.utils.data.DataLoader(dataset_dynamics, batch_size=1, shuffle=False) + + inputs_structure = [] + inputs_dynamics = [] + outputs = [] + with torch.no_grad(), precision_scope("cuda"): + tic = time.time() + for i, input_structure, input_dynamics in 
zip(tqdm(range(min_len)), dataloader_structure, dataloader_dynamics): + if input_structure.shape[2] <= 5: + continue + input_structure = input_structure.to(device) + input_dynamics = input_dynamics.to(device) + if args.dynamics_split: + z, z_structure, *_ = model.encode(input_structure) + _, _, z_dynamics_x, z_dynamics_y = model.encode(input_dynamics) + xrec = model.decode(z, z_structure, z_dynamics_x, z_dynamics_y) + else: + z, z_structure, *_ = model.encode(input_structure) + _, _, z_dynamics = model.encode(input_dynamics) + xrec = model.decode(z, z_structure, z_dynamics) + input_structure = rearrange(input_structure, "b c t h w -> (b t) c h w") + inputs_structure.append(input_structure) + input_dynamics = rearrange(input_dynamics, "b c t h w -> (b t) c h w") + inputs_dynamics.append(input_dynamics) + xrec = rearrange(xrec, "b c t h w -> (b t) c h w") + outputs.append(xrec) + toc = time.time() + + # save the outputs as videos + inputs_structure = tensor_to_uint8(torch.cat(inputs_structure, dim=0)) + inputs_structure = rearrange(inputs_structure, "t c h w -> t h w c") + inputs_dynamics = tensor_to_uint8(torch.cat(inputs_dynamics, dim=0)) + inputs_dynamics = rearrange(inputs_dynamics, "t c h w -> t h w c") + outputs = tensor_to_uint8(torch.cat(outputs, dim=0)) + outputs = rearrange(outputs, "t c h w -> t h w c") + min_len = min(inputs_structure.shape[0],inputs_dynamics.shape[0], outputs.shape[0]) + final = np.concatenate([inputs_structure[:min_len], inputs_dynamics[:min_len], outputs[:min_len]], axis=2) if args.concate_input else outputs[:min_len] + + output_video_path = os.path.join(args.output_video_dir, f"structure_{Path(args.input_video_path_structure).stem}_dynamics_{Path(args.input_video_path_dynamics).stem}_reconstructed.mp4") + write_video(output_video_path, final, args.sample_fps) + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Saved the reconstructed video to {output_video_path}") + print0(f"[bold red]\[vidtwin.scripts.inference_vidtwin_cross_reconstruct][/bold red] Time taken: {toc - tic:.2f}s") + +if __name__ == "__main__": + main() diff --git a/Meissonic/vidtok_cache/models--microsoft--VidTok/refs/main b/Meissonic/vidtok_cache/models--microsoft--VidTok/refs/main new file mode 100644 index 0000000000000000000000000000000000000000..0e73e130a000ea965283135cbda0a0137035212c --- /dev/null +++ b/Meissonic/vidtok_cache/models--microsoft--VidTok/refs/main @@ -0,0 +1 @@ +97eb9fb68314830f2fb62d899f3ce7ff15bc9929 \ No newline at end of file diff --git a/Meissonic/vidtok_cache/vidtok_kl_causal_488_4chn.yaml b/Meissonic/vidtok_cache/vidtok_kl_causal_488_4chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d14b0d75435b24affad90095e4d6e42c6525a8d --- /dev/null +++ b/Meissonic/vidtok_cache/vidtok_kl_causal_488_4chn.yaml @@ -0,0 +1,112 @@ +model: + base_learning_rate: 1e-5 + target: vidtok.models.autoencoder.AutoencodingEngine + params: + monitor: val/rec_loss + mode: min + # ckpt_path: checkpoints/vidtok_kl_causal_488_4chn.ckpt # train from existing checkpoint + ignore_keys: [] + # ema_decay: 0.999 + + encoder_config: + target: vidtok.modules.model_3dcausal.EncoderCausal3DPadding + params: + double_z: true + z_channels: 4 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + time_downsample_factor: 4 + num_res_blocks: 2 + dropout: 0.0 + use_checkpoint: false + init_pad_mode: replicate + norm_type: layernorm # layernorm, groupnorm + fix_encoder: false # if True, fix it without updating params + fix_decoder: false # if 
True, fix it without updating params + + decoder_config: + target: vidtok.modules.model_3dcausal.DecoderCausal3DPadding + params: ${model.params.encoder_config.params} + + regularizer_config: + target: vidtok.modules.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: vidtok.modules.losses.GeneralLPIPSWithDiscriminator + params: + dims: 3 # video - [t,h,w] + perceptual_weight: 1.0 + disc_start: 20001 + disc_weight: 0.2 + disc_type: 2d # 2d, 3d + learn_logvar: true + gen_loss_cross_entropy: true + lecam_loss_weight: 0.005 + regularization_weights: {'aux_loss': 1.0, 'kl_loss': 0.000001} + +data: + target: vidtok.data.datamodule.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 12 + + train: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_1 # DATA_DIR for training data + meta_path: META_PATH_1 # path to the .csv meta file of training data + video_params: + input_height: INPUT_HEIGHT_1 + input_width: INPUT_WIDTH_1 + sample_num_frames: 17 + sample_fps: 3 + + validation: + target: vidtok.data.vidtok.VidTokDataset + params: + data_dir: DATA_DIR_2 # DATA_DIR for validation data + meta_path: META_PATH_2 # path to the .csv meta file of validation data + video_params: + input_height: INPUT_HEIGHT_2 + input_width: INPUT_WIDTH_2 + sample_num_frames: 17 + sample_fps: 8 + start_index: 0 + +lightning: + strategy: + target: lightning.pytorch.strategies.DDPStrategy + params: + find_unused_parameters: true + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + image_logger: + target: vidtok.modules.logger.ImageVideoLogger + params: + disabled: false + rescale: true + enable_autocast: false + batch_frequency: 5000 + max_samples: 2 + increase_log_steps: false + log_first_step: false + log_before_first_step: false + log_images_kwargs: + n_rows: 17 + + trainer: + precision: bf16-mixed + devices: auto + num_nodes: 1 + benchmark: true + num_sanity_val_steps: 10 + val_check_interval: 2000 + check_val_every_n_epoch: null # default: 1 + accumulate_grad_batches: 1 + max_epochs: 1000 diff --git a/Meissonic/wandb/debug-internal.log b/Meissonic/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d8dbd8f1d87e2a1194f2fe70f37102cb3ea4a565 --- /dev/null +++ b/Meissonic/wandb/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-10T11:44:39.85137573Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T11:44:40.016600149Z","level":"INFO","msg":"stream: created new stream","id":"mrtah7xe"} +{"time":"2025-12-10T11:44:40.016677758Z","level":"INFO","msg":"handler: started","stream_id":"mrtah7xe"} +{"time":"2025-12-10T11:44:40.016799939Z","level":"INFO","msg":"stream: started","id":"mrtah7xe"} +{"time":"2025-12-10T11:44:40.016877013Z","level":"INFO","msg":"writer: started","stream_id":"mrtah7xe"} +{"time":"2025-12-10T11:44:40.016880098Z","level":"INFO","msg":"sender: started","stream_id":"mrtah7xe"} diff --git a/Meissonic/wandb/debug.log b/Meissonic/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2c28c8a071776362325b31a769670028f231f277 --- /dev/null +++ b/Meissonic/wandb/debug.log @@ -0,0 +1,22 @@ +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Configure stats pid to 1312594 +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings 
+2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug.log +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug-internal.log +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:init():841] calling init triggers +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:init():889] starting backend +2025-12-10 11:44:39,845 INFO MainThread:1312594 [wandb_init.py:init():892] sending inform_init request +2025-12-10 11:44:39,849 INFO MainThread:1312594 [wandb_init.py:init():900] backend started and connected +2025-12-10 11:44:39,851 INFO MainThread:1312594 [wandb_init.py:init():970] updated telemetry +2025-12-10 11:44:39,855 INFO MainThread:1312594 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 11:44:40,285 INFO MainThread:1312594 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 11:44:40,377 INFO MainThread:1312594 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 11:44:40,377 INFO MainThread:1312594 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 11:44:40,377 INFO MainThread:1312594 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 11:44:40,377 INFO MainThread:1312594 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-10 11:44:40,380 INFO MainThread:1312594 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 11:44:40,380 INFO MainThread:1312594 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/files/config.yaml b/Meissonic/wandb/run-20251206_165839-coce2d90/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18b61239c9c4fd962bb410886f0a44b90a441ef5 --- /dev/null +++ b/Meissonic/wandb/run-20251206_165839-coce2d90/files/config.yaml @@ -0,0 +1,257 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + x3dyrlqokh74e256ov74njca6w8ohpdp: + args: + - --model_type + - video + - --text_encoder_architecture + - umt5-base + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --pretrained_model_name_or_path + - path/to/pretrained + - --train_batch_size + - "4" + - --max_train_steps + - "10000" + - --validation_steps + - "500" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11624820023296" + email: 
catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T16:58:39.106774Z" + writerId: x3dyrlqokh74e256ov74njca6w8ohpdp + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: false +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: null +model_type: + value: video +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_architecture: + value: Meissonic +pretrained_model_name_or_path: + value: path/to/pretrained +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: null +split_vae_encode: + value: null +text_encoder_architecture: + value: umt5-base 
+text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: false +use_8bit_adam: + value: false +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/files/output.log b/Meissonic/wandb/run-20251206_165839-coce2d90/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..431042feeab948447cc1db99b34b247cae3f1b93 --- /dev/null +++ b/Meissonic/wandb/run-20251206_165839-coce2d90/files/output.log @@ -0,0 +1,54 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|█████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6262.83it/s] +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/configuration_utils.py", line 392, in load_config + config_file = hf_hub_download( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn + validate_repo_id(arg_value) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id + raise HFValidationError( +huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'path/to/pretrained'. Use `repo_type` argument if needed. + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1410, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 569, in main + model = WanDiscreteVideoTransformer.from_pretrained( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 1054, in from_pretrained + config, unused_kwargs, commit_hash = cls.load_config( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/configuration_utils.py", line 428, in load_config + raise EnvironmentError( +OSError: We couldn't connect to 'https://huggingface.co' to load this model, couldn't find it in the cached files and it looks like path/to/pretrained is not the path to a directory containing a config.json file. 
+Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'. +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/configuration_utils.py", line 392, in load_config + config_file = hf_hub_download( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn + validate_repo_id(arg_value) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id + raise HFValidationError( +huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'path/to/pretrained'. Use `repo_type` argument if needed. + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1410, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 569, in main + model = WanDiscreteVideoTransformer.from_pretrained( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 1054, in from_pretrained + config, unused_kwargs, commit_hash = cls.load_config( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/configuration_utils.py", line 428, in load_config + raise EnvironmentError( +OSError: We couldn't connect to 'https://huggingface.co' to load this model, couldn't find it in the cached files and it looks like path/to/pretrained is not the path to a directory containing a config.json file. +Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'. 
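Note on the failure above: this run passed the literal placeholder path/to/pretrained as --pretrained_model_name_or_path, so diffusers' from_pretrained could neither treat it as a Hub repo id nor find a local directory containing a config.json. The check below is only an illustrative sketch (it is not part of train_mei_video.py); it assumes nothing beyond the huggingface_hub validators already visible in the traceback.

import os
from huggingface_hub.errors import HFValidationError
from huggingface_hub.utils import validate_repo_id

def resolve_pretrained_source(name_or_path: str) -> str:
    # Accept a local model directory that actually contains a config.json ...
    if os.path.isdir(name_or_path):
        if not os.path.isfile(os.path.join(name_or_path, "config.json")):
            raise FileNotFoundError(
                f"'{name_or_path}' has no config.json; point "
                "--pretrained_model_name_or_path at a saved model directory."
            )
        return name_or_path
    # ... or a well-formed Hub repo id such as 'org/model-name'.
    try:
        validate_repo_id(name_or_path)
    except HFValidationError as err:
        raise ValueError(
            f"'{name_or_path}' is neither a local model directory nor a valid "
            "Hub repo id; it looks like an unreplaced placeholder."
        ) from err
    return name_or_path

Failing fast like this before calling from_pretrained would surface the misconfigured argument directly instead of the nested HFValidationError/OSError recorded in this output.log.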
diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/files/requirements.txt b/Meissonic/wandb/run-20251206_165839-coce2d90/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..29f8ef97a1a3b2dda14232bbc1d655bc5e428b19 --- /dev/null +++ b/Meissonic/wandb/run-20251206_165839-coce2d90/files/requirements.txt @@ -0,0 +1,138 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_165839-coce2d90/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3a74f9c32ab81114a7dc8cff3b0d372139be890e --- /dev/null +++ b/Meissonic/wandb/run-20251206_165839-coce2d90/files/wandb-metadata.json @@ -0,0 +1,120 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + 
"startedAt": "2025-12-06T16:58:39.106774Z", + "args": [ + "--model_type", + "video", + "--text_encoder_architecture", + "umt5-base", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--pretrained_model_name_or_path", + "path/to/pretrained", + "--train_batch_size", + "4", + "--max_train_steps", + "10000", + "--validation_steps", + "500", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11624820023296" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "x3dyrlqokh74e256ov74njca6w8ohpdp" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/files/wandb-summary.json b/Meissonic/wandb/run-20251206_165839-coce2d90/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1e541721647ba388e7207f3c72c53f101ac4527a --- /dev/null +++ b/Meissonic/wandb/run-20251206_165839-coce2d90/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":5,"_wandb":{"runtime":5}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug-core.log b/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug-core.log new file mode 100644 index 
0000000000000000000000000000000000000000..361805cd5207f38c2663b271ddaefe0b6efdb294 --- /dev/null +++ b/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T16:58:39.529568619Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpkj46qozx/port-3149834.txt","pid":3149834,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T16:58:39.531671658Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3149834} +{"time":"2025-12-06T16:58:39.531734514Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3149834-3150000-3810356159/socket","Net":"unix"}} +{"time":"2025-12-06T16:58:39.641166583Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T16:58:39.653434761Z","level":"INFO","msg":"handleInformInit: received","streamId":"coce2d90","id":"1(@)"} +{"time":"2025-12-06T16:58:39.937371907Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"coce2d90","id":"1(@)"} +{"time":"2025-12-06T16:58:46.159956029Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T16:58:46.160010484Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-06T16:58:46.16000444Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-06T16:58:46.160084369Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-06T16:58:46.160103562Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3149834-3150000-3810356159/socket","Net":"unix"}} +{"time":"2025-12-06T16:58:46.57983654Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T16:58:46.579884575Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T16:58:46.579897335Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug-internal.log b/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..02a45ce49f7653aa85663e14d0995c4bf4f9b0e7 --- /dev/null +++ b/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T16:58:39.65449476Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T16:58:39.935999892Z","level":"INFO","msg":"stream: created new stream","id":"coce2d90"} +{"time":"2025-12-06T16:58:39.937238799Z","level":"INFO","msg":"handler: started","stream_id":"coce2d90"} +{"time":"2025-12-06T16:58:39.93736157Z","level":"INFO","msg":"stream: started","id":"coce2d90"} +{"time":"2025-12-06T16:58:39.937374899Z","level":"INFO","msg":"writer: started","stream_id":"coce2d90"} +{"time":"2025-12-06T16:58:39.937374809Z","level":"INFO","msg":"sender: started","stream_id":"coce2d90"} +{"time":"2025-12-06T16:58:46.160697064Z","level":"INFO","msg":"stream: closing","id":"coce2d90"} +{"time":"2025-12-06T16:58:46.429454486Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T16:58:46.576139882Z","level":"INFO","msg":"handler: closed","stream_id":"coce2d90"} +{"time":"2025-12-06T16:58:46.576233921Z","level":"INFO","msg":"sender: closed","stream_id":"coce2d90"} 
+{"time":"2025-12-06T16:58:46.576241901Z","level":"INFO","msg":"stream: closed","id":"coce2d90"} diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug.log b/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ccf62ed968a8d9318e60f3cb150e2fd548caaef8 --- /dev/null +++ b/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 16:58:39,109 INFO MainThread:3149834 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 16:58:39,109 INFO MainThread:3149834 [wandb_setup.py:_flush():80] Configure stats pid to 3149834 +2025-12-06 16:58:39,109 INFO MainThread:3149834 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 16:58:39,109 INFO MainThread:3149834 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 16:58:39,109 INFO MainThread:3149834 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 16:58:39,110 INFO MainThread:3149834 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug.log +2025-12-06 16:58:39,110 INFO MainThread:3149834 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_165839-coce2d90/logs/debug-internal.log +2025-12-06 16:58:39,110 INFO MainThread:3149834 [wandb_init.py:init():841] calling init triggers +2025-12-06 16:58:39,110 INFO MainThread:3149834 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 16:58:39,110 INFO MainThread:3149834 [wandb_init.py:init():889] starting backend +2025-12-06 16:58:39,640 INFO MainThread:3149834 [wandb_init.py:init():892] sending inform_init request +2025-12-06 16:58:39,645 INFO MainThread:3149834 [wandb_init.py:init():900] backend started and connected +2025-12-06 16:58:39,647 INFO MainThread:3149834 [wandb_init.py:init():970] updated telemetry +2025-12-06 16:58:39,651 INFO MainThread:3149834 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 16:58:40,179 INFO MainThread:3149834 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 16:58:40,290 INFO MainThread:3149834 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 16:58:40,290 INFO MainThread:3149834 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 16:58:40,290 INFO MainThread:3149834 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 16:58:40,290 INFO MainThread:3149834 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-06 16:58:40,295 INFO MainThread:3149834 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 16:58:40,295 INFO MainThread:3149834 [wandb_run.py:_config_callback():1396] config_cb None None {'pretrained_model_architecture': 'Meissonic', 'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': False, 'pretrained_model_name_or_path': 'path/to/pretrained', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': False, 'dataloader_num_workers': 0, 'allow_tf32': False, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': None, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': None, 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': None, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': False, 'prompt_prefix': None, 'model_type': 'video', 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 16:58:46,160 INFO wandb-AsyncioManager-main:3149834 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 16:58:46,160 INFO wandb-AsyncioManager-main:3149834 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251206_165839-coce2d90/run-coce2d90.wandb b/Meissonic/wandb/run-20251206_165839-coce2d90/run-coce2d90.wandb new file mode 100644 index 0000000000000000000000000000000000000000..4f4d60d2359bc2138b135ea6e054aa2b1f2c3805 Binary files /dev/null and b/Meissonic/wandb/run-20251206_165839-coce2d90/run-coce2d90.wandb differ diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/config.yaml b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23333bcfd69f2a97dbbcdb40e3f36b49383dd2d8 --- /dev/null +++ b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/config.yaml @@ -0,0 +1,257 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 8n0dgkjdtpjiqdc6glkrads3rqpkoop6: + args: + - --model_type + - video + - --text_encoder_architecture + - umt5-base + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --pretrained_model_name_or_path + - path/to/pretrained + - --train_batch_size + - "4" + - --max_train_steps + - "10000" + - --validation_steps + - "500" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11624820195328" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T17:00:53.639529Z" + writerId: 8n0dgkjdtpjiqdc6glkrads3rqpkoop6 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 
3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: false +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: null +model_type: + value: video +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_architecture: + value: Meissonic +pretrained_model_name_or_path: + value: path/to/pretrained +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: null +split_vae_encode: + value: null +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: false +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/output.log b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8103eb94759d94cf0cb9950021b9d7eb4024aa3a --- /dev/null +++ b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/output.log @@ -0,0 +1,18 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11949.58it/s] +12/06/2025 17:01:25 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/06/2025 17:01:33 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/06/2025 17:01:33 - INFO - train.dataset_utils - Using decord for video loading +12/06/2025 17:01:33 - INFO - __main__ - Preparing model, optimizer and dataloaders +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1410, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 902, in main + vq_model.to(device=accelerator.device) +AttributeError: 'NoneType' object has no attribute 'to' +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1410, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 902, in main + vq_model.to(device=accelerator.device) +AttributeError: 'NoneType' object has no attribute 'to' diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/requirements.txt b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..29f8ef97a1a3b2dda14232bbc1d655bc5e428b19 --- /dev/null +++ b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/requirements.txt @@ -0,0 +1,138 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 
+nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..051cf6ab3ceb7534d7ca6a7b0645580fd5e3201e --- /dev/null +++ b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/wandb-metadata.json @@ -0,0 +1,120 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-06T17:00:53.639529Z", + "args": [ + "--model_type", + "video", + "--text_encoder_architecture", + "umt5-base", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--pretrained_model_name_or_path", + "path/to/pretrained", + "--train_batch_size", + "4", + "--max_train_steps", + "10000", + "--validation_steps", + "500", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11624820195328" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + 
"name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "8n0dgkjdtpjiqdc6glkrads3rqpkoop6" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/wandb-summary.json b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3ec09bc7401d0ad79cc5d30b398a997aeec874f5 --- /dev/null +++ b/Meissonic/wandb/run-20251206_170053-qazxw8w7/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":43},"_runtime":43} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug-core.log b/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..774f07adf05509f2ba00763ca805b41449a67558 --- /dev/null +++ b/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T17:00:53.710328164Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp3z7mdu80/port-3150329.txt","pid":3150329,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T17:00:53.711506677Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3150329} +{"time":"2025-12-06T17:00:53.711478145Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3150329-3150497-378485305/socket","Net":"unix"}} +{"time":"2025-12-06T17:00:53.895888189Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T17:00:53.901507126Z","level":"INFO","msg":"handleInformInit: received","streamId":"qazxw8w7","id":"1(@)"} +{"time":"2025-12-06T17:00:54.068380299Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"qazxw8w7","id":"1(@)"} +{"time":"2025-12-06T17:01:37.788581267Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T17:01:37.788630676Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-06T17:01:37.788621957Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-06T17:01:37.788722269Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3150329-3150497-378485305/socket","Net":"unix"}} +{"time":"2025-12-06T17:01:37.788747149Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-06T17:01:38.160880449Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T17:01:38.160903436Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T17:01:38.160916079Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug-internal.log b/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug-internal.log new file mode 100644 index 
0000000000000000000000000000000000000000..514994847c772a99fe058e82c9eba6bab5ef9ca8 --- /dev/null +++ b/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T17:00:53.901599416Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T17:00:54.068204039Z","level":"INFO","msg":"stream: created new stream","id":"qazxw8w7"} +{"time":"2025-12-06T17:00:54.068273728Z","level":"INFO","msg":"handler: started","stream_id":"qazxw8w7"} +{"time":"2025-12-06T17:00:54.068371596Z","level":"INFO","msg":"stream: started","id":"qazxw8w7"} +{"time":"2025-12-06T17:00:54.068388013Z","level":"INFO","msg":"writer: started","stream_id":"qazxw8w7"} +{"time":"2025-12-06T17:00:54.06838894Z","level":"INFO","msg":"sender: started","stream_id":"qazxw8w7"} +{"time":"2025-12-06T17:01:37.788642929Z","level":"INFO","msg":"stream: closing","id":"qazxw8w7"} +{"time":"2025-12-06T17:01:38.051197795Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T17:01:38.157544182Z","level":"INFO","msg":"handler: closed","stream_id":"qazxw8w7"} +{"time":"2025-12-06T17:01:38.157628377Z","level":"INFO","msg":"sender: closed","stream_id":"qazxw8w7"} +{"time":"2025-12-06T17:01:38.157636542Z","level":"INFO","msg":"stream: closed","id":"qazxw8w7"} diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug.log b/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..54e171ec368f226d2290ab86869709441395a914 --- /dev/null +++ b/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_setup.py:_flush():80] Configure stats pid to 3150329 +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug.log +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_170053-qazxw8w7/logs/debug-internal.log +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_init.py:init():841] calling init triggers +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 17:00:53,642 INFO MainThread:3150329 [wandb_init.py:init():889] starting backend +2025-12-06 17:00:53,896 INFO MainThread:3150329 [wandb_init.py:init():892] sending inform_init request +2025-12-06 17:00:53,900 INFO MainThread:3150329 [wandb_init.py:init():900] backend started and connected +2025-12-06 17:00:53,901 INFO MainThread:3150329 [wandb_init.py:init():970] updated telemetry +2025-12-06 17:00:53,905 INFO MainThread:3150329 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 17:00:54,287 INFO MainThread:3150329 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 
17:00:54,396 INFO MainThread:3150329 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 17:00:54,396 INFO MainThread:3150329 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 17:00:54,396 INFO MainThread:3150329 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 17:00:54,396 INFO MainThread:3150329 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-06 17:00:54,399 INFO MainThread:3150329 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 17:00:54,400 INFO MainThread:3150329 [wandb_run.py:_config_callback():1396] config_cb None None {'pretrained_model_architecture': 'Meissonic', 'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'path/to/pretrained', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': False, 'dataloader_num_workers': 0, 'allow_tf32': False, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': None, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': None, 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': None, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': False, 'prompt_prefix': None, 'model_type': 'video', 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 17:01:37,788 INFO wandb-AsyncioManager-main:3150329 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 17:01:37,788 INFO wandb-AsyncioManager-main:3150329 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
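Note on the failure in this run's output.log: train_mei_video.py reached line 902 with vq_model still None, so vq_model.to(device=...) raised AttributeError. The sketch below is hypothetical (the names vq_model and accelerator/device come from the traceback, not from the repo's code) and only shows the kind of guard that turns the crash into a readable error.

import torch
from typing import Optional

def move_video_tokenizer(vq_model: Optional[torch.nn.Module], device: torch.device) -> torch.nn.Module:
    # Guard the call that failed above instead of letting None.to(...) raise.
    if vq_model is None:
        raise RuntimeError(
            "Video tokenizer was never instantiated (vq_model is None); "
            "check that --video_tokenizer_model_id "
            "(here Cosmos-1.0-Tokenizer-DV8x16x16) was downloaded or cached "
            "before starting training."
        )
    return vq_model.to(device=device)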
diff --git a/Meissonic/wandb/run-20251206_170053-qazxw8w7/run-qazxw8w7.wandb b/Meissonic/wandb/run-20251206_170053-qazxw8w7/run-qazxw8w7.wandb new file mode 100644 index 0000000000000000000000000000000000000000..fd7ad35d5132b83d8013926b9f8d08dd111d99cb Binary files /dev/null and b/Meissonic/wandb/run-20251206_170053-qazxw8w7/run-qazxw8w7.wandb differ diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/config.yaml b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db7395d2e6be6f48a6a0522966cebd6dc94dfd2d --- /dev/null +++ b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/config.yaml @@ -0,0 +1,284 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + tmhegmsvd7wi4jjhvzb1irp5883damwm: + args: + - --text_encoder_architecture + - umt5-base + - --pretrained_model_architecture + - Meissonic + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11624820785152" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T17:30:11.294956Z" + writerId: tmhegmsvd7wi4jjhvzb1irp5883damwm + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_architecture: + value: Meissonic +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/output.log b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2dd9d3e3012bdfd3e28be7859946639a51212f6b --- /dev/null +++ b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/output.log @@ -0,0 +1,18 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 20095.91it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1031, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 501, in main + model.enable_gradient_checkpointing() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 293, in enable_gradient_checkpointing + raise ValueError( +ValueError: WanDiscreteVideoTransformer does not support gradient checkpointing. Please make sure to set the boolean attribute `_supports_gradient_checkpointing` to `True` in the class definition. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1031, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 501, in main + model.enable_gradient_checkpointing() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 293, in enable_gradient_checkpointing + raise ValueError( +ValueError: WanDiscreteVideoTransformer does not support gradient checkpointing. Please make sure to set the boolean attribute `_supports_gradient_checkpointing` to `True` in the class definition. diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/requirements.txt b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..29f8ef97a1a3b2dda14232bbc1d655bc5e428b19 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/requirements.txt @@ -0,0 +1,138 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 
+cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5e9778a709da4172f6684381008f836c3c4e502a --- /dev/null +++ b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-06T17:30:11.294956Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--pretrained_model_architecture", + "Meissonic", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11624820785152" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": 
"GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "tmhegmsvd7wi4jjhvzb1irp5883damwm" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/wandb-summary.json b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..99bd83bf5edb6669d823b5cece522f21f090dd47 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":32,"_wandb":{"runtime":32}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug-core.log b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..f8078109d5e936310c6a6a3af843f96bbc847463 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T17:30:11.366354728Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpcd_o9kuo/port-3163546.txt","pid":3163546,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T17:30:11.366896653Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3163546} +{"time":"2025-12-06T17:30:11.366851433Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3163546-3163757-128500771/socket","Net":"unix"}} +{"time":"2025-12-06T17:30:11.552442435Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T17:30:11.557785436Z","level":"INFO","msg":"handleInformInit: received","streamId":"1e2kx0pz","id":"1(@)"} +{"time":"2025-12-06T17:30:11.824782627Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"1e2kx0pz","id":"1(@)"} +{"time":"2025-12-06T17:30:44.253664302Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T17:30:44.253716581Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-06T17:30:44.253716581Z","level":"INFO","msg":"connection: closing","id":"1(@)"} 
+{"time":"2025-12-06T17:30:44.253814889Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-06T17:30:44.253804259Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3163546-3163757-128500771/socket","Net":"unix"}} +{"time":"2025-12-06T17:30:44.629667721Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T17:30:44.629682278Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T17:30:44.629690099Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug-internal.log b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..36b38603a1ab56032a832b726e86fe6008766eea --- /dev/null +++ b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T17:30:11.557886289Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T17:30:11.824580053Z","level":"INFO","msg":"stream: created new stream","id":"1e2kx0pz"} +{"time":"2025-12-06T17:30:11.824661235Z","level":"INFO","msg":"handler: started","stream_id":"1e2kx0pz"} +{"time":"2025-12-06T17:30:11.824775327Z","level":"INFO","msg":"stream: started","id":"1e2kx0pz"} +{"time":"2025-12-06T17:30:11.824789427Z","level":"INFO","msg":"writer: started","stream_id":"1e2kx0pz"} +{"time":"2025-12-06T17:30:11.824796821Z","level":"INFO","msg":"sender: started","stream_id":"1e2kx0pz"} +{"time":"2025-12-06T17:30:44.253728752Z","level":"INFO","msg":"stream: closing","id":"1e2kx0pz"} +{"time":"2025-12-06T17:30:44.519783569Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T17:30:44.626616034Z","level":"INFO","msg":"handler: closed","stream_id":"1e2kx0pz"} +{"time":"2025-12-06T17:30:44.626690867Z","level":"INFO","msg":"sender: closed","stream_id":"1e2kx0pz"} +{"time":"2025-12-06T17:30:44.626698374Z","level":"INFO","msg":"stream: closed","id":"1e2kx0pz"} diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug.log b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..64a341b04243ef0bd2c309301a27761e29c0b91b --- /dev/null +++ b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_setup.py:_flush():80] Configure stats pid to 3163546 +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug.log +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_173011-1e2kx0pz/logs/debug-internal.log +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_init.py:init():841] 
calling init triggers +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 17:30:11,298 INFO MainThread:3163546 [wandb_init.py:init():889] starting backend +2025-12-06 17:30:11,552 INFO MainThread:3163546 [wandb_init.py:init():892] sending inform_init request +2025-12-06 17:30:11,556 INFO MainThread:3163546 [wandb_init.py:init():900] backend started and connected +2025-12-06 17:30:11,557 INFO MainThread:3163546 [wandb_init.py:init():970] updated telemetry +2025-12-06 17:30:11,561 INFO MainThread:3163546 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 17:30:12,066 INFO MainThread:3163546 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 17:30:12,173 INFO MainThread:3163546 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 17:30:12,173 INFO MainThread:3163546 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 17:30:12,173 INFO MainThread:3163546 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 17:30:12,173 INFO MainThread:3163546 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-06 17:30:12,176 INFO MainThread:3163546 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 17:30:12,177 INFO MainThread:3163546 [wandb_run.py:_config_callback():1396] config_cb None None {'pretrained_model_architecture': 'Meissonic', 'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 17:30:44,253 INFO wandb-AsyncioManager-main:3163546 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 17:30:44,253 INFO wandb-AsyncioManager-main:3163546 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251206_173011-1e2kx0pz/run-1e2kx0pz.wandb b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/run-1e2kx0pz.wandb new file mode 100644 index 0000000000000000000000000000000000000000..58ee2635b7e21b12277929900f11633614dabda9 Binary files /dev/null and b/Meissonic/wandb/run-20251206_173011-1e2kx0pz/run-1e2kx0pz.wandb differ diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/files/config.yaml b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c4d6eef697a400ee6c0f3611b6053cf66ce83a3 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 1wi5ybsxmpos30re2r2lptdb51avrp0x: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11624821026816" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T17:36:07.618799Z" + writerId: 1wi5ybsxmpos30re2r2lptdb51avrp0x + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/files/output.log b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..3b1ca9633209941d6f29c32487dc0b5e6d4d4341 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/output.log @@ -0,0 +1,22 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11012.80it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 489, in main + model.enable_gradient_checkpointing() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 310, in enable_gradient_checkpointing + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 1814, in _set_gradient_checkpointing + raise ValueError( +ValueError: The module WanDiscreteVideoTransformer does not support gradient checkpointing. Please make sure to use a module that supports gradient checkpointing by creating a boolean attribute `gradient_checkpointing`. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 489, in main + model.enable_gradient_checkpointing() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 310, in enable_gradient_checkpointing + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 1814, in _set_gradient_checkpointing + raise ValueError( +ValueError: The module WanDiscreteVideoTransformer does not support gradient checkpointing. Please make sure to use a module that supports gradient checkpointing by creating a boolean attribute `gradient_checkpointing`. 
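Note on the traceback above: diffusers' enable_gradient_checkpointing() (modeling_utils.py:310) forwards to _set_gradient_checkpointing(enable=True, gradient_checkpointing_func=...), and raises this ValueError when no submodule exposes a boolean gradient_checkpointing attribute. The sketch below is a minimal, self-contained illustration of that contract only; the block layout and class names are hypothetical stand-ins, not the repository's actual WanDiscreteVideoTransformer.

# Minimal sketch, assuming a simplified transformer; illustrative only.
from functools import partial

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class Block(nn.Module):
    """Hypothetical stand-in for one transformer block."""

    def __init__(self, dim: int = 64):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.ff = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.ff(self.norm(x))


class VideoTransformerSketch(nn.Module):
    """Shows the two pieces the error message asks for: a boolean
    `gradient_checkpointing` attribute, plus a `_set_gradient_checkpointing`
    method whose signature accepts the keywords used at the diffusers call
    site shown in the traceback."""

    def __init__(self, dim: int = 64, depth: int = 2):
        super().__init__()
        self.blocks = nn.ModuleList(Block(dim) for _ in range(depth))
        self.gradient_checkpointing = False  # the attribute the ValueError names
        self._ckpt_func = partial(checkpoint, use_reentrant=False)

    def _set_gradient_checkpointing(self, enable: bool = True,
                                    gradient_checkpointing_func=None):
        self.gradient_checkpointing = enable
        if gradient_checkpointing_func is not None:
            self._ckpt_func = gradient_checkpointing_func

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for block in self.blocks:
            if self.gradient_checkpointing and self.training:
                x = self._ckpt_func(block, x)  # activations recomputed in backward
            else:
                x = block(x)
        return x


if __name__ == "__main__":
    model = VideoTransformerSketch().train()
    model._set_gradient_checkpointing(enable=True)
    out = model(torch.randn(2, 8, 64, requires_grad=True))
    out.sum().backward()  # per-block activations are recomputed here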
diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/files/requirements.txt b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..29f8ef97a1a3b2dda14232bbc1d655bc5e428b19 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/requirements.txt @@ -0,0 +1,138 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6ee5fca55c043fafddd9db6c77b1a45538b71168 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + 
"startedAt": "2025-12-06T17:36:07.618799Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11624821026816" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "1wi5ybsxmpos30re2r2lptdb51avrp0x" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/files/wandb-summary.json b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/wandb-summary.json new file mode 100644 index 
0000000000000000000000000000000000000000..cc5cdaebc2683b09a548d460a96137551609f838 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173607-2pns5nly/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":31,"_wandb":{"runtime":31}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug-core.log b/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..41bc1769f37ef48b839518dca9cbceb7fac54df1 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T17:36:07.688887018Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpc_m3iyvi/port-3166629.txt","pid":3166629,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T17:36:07.68934107Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3166629} +{"time":"2025-12-06T17:36:07.689341347Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3166629-3166847-399507443/socket","Net":"unix"}} +{"time":"2025-12-06T17:36:07.872836592Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T17:36:07.878520082Z","level":"INFO","msg":"handleInformInit: received","streamId":"2pns5nly","id":"1(@)"} +{"time":"2025-12-06T17:36:08.049585246Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"2pns5nly","id":"1(@)"} +{"time":"2025-12-06T17:36:39.715969818Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T17:36:39.716046804Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-06T17:36:39.716046734Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-06T17:36:39.716087561Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-06T17:36:39.716121228Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3166629-3166847-399507443/socket","Net":"unix"}} +{"time":"2025-12-06T17:36:40.095380502Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T17:36:40.095405659Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T17:36:40.095419285Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug-internal.log b/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..66fde455b42b2c253c54080bea90cfea07aa139f --- /dev/null +++ b/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T17:36:07.878612307Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T17:36:08.049456127Z","level":"INFO","msg":"stream: created new stream","id":"2pns5nly"} +{"time":"2025-12-06T17:36:08.049512949Z","level":"INFO","msg":"handler: started","stream_id":"2pns5nly"} +{"time":"2025-12-06T17:36:08.049577902Z","level":"INFO","msg":"stream: started","id":"2pns5nly"} +{"time":"2025-12-06T17:36:08.049590801Z","level":"INFO","msg":"writer: started","stream_id":"2pns5nly"} +{"time":"2025-12-06T17:36:08.049591041Z","level":"INFO","msg":"sender: started","stream_id":"2pns5nly"} 
+{"time":"2025-12-06T17:36:39.71602136Z","level":"INFO","msg":"stream: closing","id":"2pns5nly"} +{"time":"2025-12-06T17:36:39.976843494Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T17:36:40.092395229Z","level":"INFO","msg":"handler: closed","stream_id":"2pns5nly"} +{"time":"2025-12-06T17:36:40.092474944Z","level":"INFO","msg":"sender: closed","stream_id":"2pns5nly"} +{"time":"2025-12-06T17:36:40.092481538Z","level":"INFO","msg":"stream: closed","id":"2pns5nly"} diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug.log b/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0f232aa892b484ed3d285025d575b7e26bb5fc53 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 17:36:07,621 INFO MainThread:3166629 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 17:36:07,621 INFO MainThread:3166629 [wandb_setup.py:_flush():80] Configure stats pid to 3166629 +2025-12-06 17:36:07,621 INFO MainThread:3166629 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 17:36:07,621 INFO MainThread:3166629 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 17:36:07,621 INFO MainThread:3166629 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 17:36:07,621 INFO MainThread:3166629 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug.log +2025-12-06 17:36:07,621 INFO MainThread:3166629 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_173607-2pns5nly/logs/debug-internal.log +2025-12-06 17:36:07,622 INFO MainThread:3166629 [wandb_init.py:init():841] calling init triggers +2025-12-06 17:36:07,622 INFO MainThread:3166629 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 17:36:07,622 INFO MainThread:3166629 [wandb_init.py:init():889] starting backend +2025-12-06 17:36:07,873 INFO MainThread:3166629 [wandb_init.py:init():892] sending inform_init request +2025-12-06 17:36:07,876 INFO MainThread:3166629 [wandb_init.py:init():900] backend started and connected +2025-12-06 17:36:07,878 INFO MainThread:3166629 [wandb_init.py:init():970] updated telemetry +2025-12-06 17:36:07,882 INFO MainThread:3166629 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 17:36:08,279 INFO MainThread:3166629 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 17:36:08,386 INFO MainThread:3166629 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 17:36:08,386 INFO MainThread:3166629 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 17:36:08,387 INFO MainThread:3166629 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 17:36:08,387 INFO MainThread:3166629 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-06 17:36:08,389 INFO MainThread:3166629 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 17:36:08,390 INFO MainThread:3166629 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 17:36:39,716 INFO wandb-AsyncioManager-main:3166629 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 17:36:39,716 INFO wandb-AsyncioManager-main:3166629 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251206_173607-2pns5nly/run-2pns5nly.wandb b/Meissonic/wandb/run-20251206_173607-2pns5nly/run-2pns5nly.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e4573b53ceee89abd4c4f8e16be9d9b29080eebe Binary files /dev/null and b/Meissonic/wandb/run-20251206_173607-2pns5nly/run-2pns5nly.wandb differ diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/config.yaml b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd24cae1d804602e0b6b3a93249058789e8e4b0b --- /dev/null +++ b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 81orl7h4sniz072dwcgm19khkg0m6rqr: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11624821116928" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T17:37:32.288087Z" + writerId: 81orl7h4sniz072dwcgm19khkg0m6rqr + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/output.log b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..276f6cd92aebe7cef1de7846d0b0e07d2e7aa63c --- /dev/null +++ b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/output.log @@ -0,0 +1,22 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|█████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8832.77it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 489, in main + model.enable_gradient_checkpointing() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 310, in enable_gradient_checkpointing + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 1814, in _set_gradient_checkpointing + raise ValueError( +ValueError: The module WanDiscreteVideoTransformer does not support gradient checkpointing. Please make sure to use a module that supports gradient checkpointing by creating a boolean attribute `gradient_checkpointing`. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 489, in main + model.enable_gradient_checkpointing() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 310, in enable_gradient_checkpointing + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 1814, in _set_gradient_checkpointing + raise ValueError( +ValueError: The module WanDiscreteVideoTransformer does not support gradient checkpointing. Please make sure to use a module that supports gradient checkpointing by creating a boolean attribute `gradient_checkpointing`. 
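The same failure recurs for this second run. One lighter-touch mitigation (an assumption on my part; the diff does not show the authors doing this) would be to guard the enable_gradient_checkpointing() call in train_mei_video.py and fall back to plain training instead of crashing, roughly:

import logging

logger = logging.getLogger(__name__)


def maybe_enable_gradient_checkpointing(model, requested: bool) -> bool:
    """Hypothetical helper, not part of the repository: try to turn on
    gradient checkpointing and degrade gracefully when the model rejects it,
    as WanDiscreteVideoTransformer does in the output.log above."""
    if not requested:
        return False
    enable = getattr(model, "enable_gradient_checkpointing", None)
    if enable is None:
        logger.warning("Model has no enable_gradient_checkpointing(); continuing without it.")
        return False
    try:
        enable()
    except (ValueError, TypeError) as err:  # these runs hit ValueError here; a later run hits TypeError
        logger.warning("Gradient checkpointing unavailable: %s", err)
        return False
    return True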
diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/requirements.txt b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..29f8ef97a1a3b2dda14232bbc1d655bc5e428b19 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/requirements.txt @@ -0,0 +1,138 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4752914986bca621f36f015c46138709f64b9f06 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + 
"startedAt": "2025-12-06T17:37:32.288087Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11624821116928" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "81orl7h4sniz072dwcgm19khkg0m6rqr" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/wandb-summary.json b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/wandb-summary.json new file mode 100644 index 
0000000000000000000000000000000000000000..cc5cdaebc2683b09a548d460a96137551609f838 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":31,"_wandb":{"runtime":31}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug-core.log b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..f480aff8c13a37e2d0001b534dd80d486991ee80 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T17:37:32.355956202Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmptokzno3s/port-3167643.txt","pid":3167643,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T17:37:32.356424898Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3167643} +{"time":"2025-12-06T17:37:32.35642138Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3167643-3167847-1825990437/socket","Net":"unix"}} +{"time":"2025-12-06T17:37:32.541515322Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T17:37:32.547491235Z","level":"INFO","msg":"handleInformInit: received","streamId":"hbmnn4a2","id":"1(@)"} +{"time":"2025-12-06T17:37:32.717488298Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"hbmnn4a2","id":"1(@)"} +{"time":"2025-12-06T17:38:03.945999972Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T17:38:03.946047939Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-06T17:38:03.946041413Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-06T17:38:03.946098759Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-06T17:38:03.946150788Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3167643-3167847-1825990437/socket","Net":"unix"}} +{"time":"2025-12-06T17:38:04.427669826Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T17:38:04.427693144Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T17:38:04.427703017Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug-internal.log b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3dfb41c78ad617786b9b1c23f24f044649477587 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T17:37:32.547587967Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T17:37:32.71728007Z","level":"INFO","msg":"stream: created new stream","id":"hbmnn4a2"} +{"time":"2025-12-06T17:37:32.717356963Z","level":"INFO","msg":"handler: started","stream_id":"hbmnn4a2"} +{"time":"2025-12-06T17:37:32.717467277Z","level":"INFO","msg":"sender: started","stream_id":"hbmnn4a2"} +{"time":"2025-12-06T17:37:32.717451947Z","level":"INFO","msg":"stream: started","id":"hbmnn4a2"} +{"time":"2025-12-06T17:37:32.717467498Z","level":"INFO","msg":"writer: started","stream_id":"hbmnn4a2"} 
+{"time":"2025-12-06T17:38:03.946051371Z","level":"INFO","msg":"stream: closing","id":"hbmnn4a2"} +{"time":"2025-12-06T17:38:04.221269445Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T17:38:04.424403842Z","level":"INFO","msg":"handler: closed","stream_id":"hbmnn4a2"} +{"time":"2025-12-06T17:38:04.424486194Z","level":"INFO","msg":"sender: closed","stream_id":"hbmnn4a2"} +{"time":"2025-12-06T17:38:04.424493894Z","level":"INFO","msg":"stream: closed","id":"hbmnn4a2"} diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug.log b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8e38d77348734fb251701ce08ab3fef599826681 --- /dev/null +++ b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 17:37:32,290 INFO MainThread:3167643 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 17:37:32,290 INFO MainThread:3167643 [wandb_setup.py:_flush():80] Configure stats pid to 3167643 +2025-12-06 17:37:32,290 INFO MainThread:3167643 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 17:37:32,291 INFO MainThread:3167643 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 17:37:32,291 INFO MainThread:3167643 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 17:37:32,291 INFO MainThread:3167643 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug.log +2025-12-06 17:37:32,291 INFO MainThread:3167643 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_173732-hbmnn4a2/logs/debug-internal.log +2025-12-06 17:37:32,291 INFO MainThread:3167643 [wandb_init.py:init():841] calling init triggers +2025-12-06 17:37:32,291 INFO MainThread:3167643 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 17:37:32,291 INFO MainThread:3167643 [wandb_init.py:init():889] starting backend +2025-12-06 17:37:32,541 INFO MainThread:3167643 [wandb_init.py:init():892] sending inform_init request +2025-12-06 17:37:32,545 INFO MainThread:3167643 [wandb_init.py:init():900] backend started and connected +2025-12-06 17:37:32,547 INFO MainThread:3167643 [wandb_init.py:init():970] updated telemetry +2025-12-06 17:37:32,552 INFO MainThread:3167643 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 17:37:32,825 INFO MainThread:3167643 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 17:37:32,933 INFO MainThread:3167643 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 17:37:32,933 INFO MainThread:3167643 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 17:37:32,933 INFO MainThread:3167643 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 17:37:32,933 INFO MainThread:3167643 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-06 17:37:32,936 INFO MainThread:3167643 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 17:37:32,937 INFO MainThread:3167643 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 17:38:03,946 INFO wandb-AsyncioManager-main:3167643 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 17:38:03,946 INFO wandb-AsyncioManager-main:3167643 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251206_173732-hbmnn4a2/run-hbmnn4a2.wandb b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/run-hbmnn4a2.wandb new file mode 100644 index 0000000000000000000000000000000000000000..14ba5034bc27a814e6eee46cbce381e0b3da236c Binary files /dev/null and b/Meissonic/wandb/run-20251206_173732-hbmnn4a2/run-hbmnn4a2.wandb differ diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/config.yaml b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8eaeefa3f5fd6f8411ff7d80ce93b60ffa60587 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + pmoejbr5xykqccyfcdcti16fyunauy6g: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11624821260288" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T17:40:40.016569Z" + writerId: pmoejbr5xykqccyfcdcti16fyunauy6g + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/output.log b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b9cf3eb4a387db1248ba4bdd3c044522ebf0fca9 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/output.log @@ -0,0 +1,18 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|█████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3889.27it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 489, in main + model.enable_gradient_checkpointing() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 310, in enable_gradient_checkpointing + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) +TypeError: WanDiscreteVideoTransformer._set_gradient_checkpointing() got an unexpected keyword argument 'enable' +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 489, in main + model.enable_gradient_checkpointing() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 310, in enable_gradient_checkpointing + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) +TypeError: WanDiscreteVideoTransformer._set_gradient_checkpointing() got an unexpected keyword argument 'enable' diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/requirements.txt b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..29f8ef97a1a3b2dda14232bbc1d655bc5e428b19 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/requirements.txt @@ -0,0 +1,138 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 
+cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2cb4b66e20e50a9945e7a8a3e27687ddda84e206 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-06T17:40:40.016569Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11624821260288" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + 
"memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "pmoejbr5xykqccyfcdcti16fyunauy6g" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/wandb-summary.json b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..2f60d8f4cdfc22e8f1819d9e97d11818d2cc95da --- /dev/null +++ b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":31},"_runtime":31} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug-core.log b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2125bfbf40f36e82f14e1f7d0381c658b9a765cd --- /dev/null +++ b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T17:40:40.083473983Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpzfcwrwno/port-3169440.txt","pid":3169440,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T17:40:40.083904386Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3169440} +{"time":"2025-12-06T17:40:40.083906365Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3169440-3169642-2293423371/socket","Net":"unix"}} +{"time":"2025-12-06T17:40:40.27059992Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T17:40:40.27616566Z","level":"INFO","msg":"handleInformInit: received","streamId":"4zw2qz8d","id":"1(@)"} +{"time":"2025-12-06T17:40:40.44413062Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"4zw2qz8d","id":"1(@)"} +{"time":"2025-12-06T17:41:12.609387148Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T17:41:12.609433524Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-06T17:41:12.609431703Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-06T17:41:12.609474554Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} 
+{"time":"2025-12-06T17:41:12.609506336Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3169440-3169642-2293423371/socket","Net":"unix"}} +{"time":"2025-12-06T17:41:12.995940468Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T17:41:12.995964663Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T17:41:12.99597316Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug-internal.log b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..cf62be558937ec658b802c238118da9287d51452 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T17:40:40.276305443Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T17:40:40.443947419Z","level":"INFO","msg":"stream: created new stream","id":"4zw2qz8d"} +{"time":"2025-12-06T17:40:40.444041233Z","level":"INFO","msg":"handler: started","stream_id":"4zw2qz8d"} +{"time":"2025-12-06T17:40:40.444123437Z","level":"INFO","msg":"stream: started","id":"4zw2qz8d"} +{"time":"2025-12-06T17:40:40.444141896Z","level":"INFO","msg":"writer: started","stream_id":"4zw2qz8d"} +{"time":"2025-12-06T17:40:40.444144802Z","level":"INFO","msg":"sender: started","stream_id":"4zw2qz8d"} +{"time":"2025-12-06T17:41:12.609439037Z","level":"INFO","msg":"stream: closing","id":"4zw2qz8d"} +{"time":"2025-12-06T17:41:12.904064573Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T17:41:12.992754035Z","level":"INFO","msg":"handler: closed","stream_id":"4zw2qz8d"} +{"time":"2025-12-06T17:41:12.992819371Z","level":"INFO","msg":"sender: closed","stream_id":"4zw2qz8d"} +{"time":"2025-12-06T17:41:12.992826419Z","level":"INFO","msg":"stream: closed","id":"4zw2qz8d"} diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug.log b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..9281537b9ba35836a74624db8c4c7febbeb830af --- /dev/null +++ b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 17:40:40,020 INFO MainThread:3169440 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 17:40:40,020 INFO MainThread:3169440 [wandb_setup.py:_flush():80] Configure stats pid to 3169440 +2025-12-06 17:40:40,020 INFO MainThread:3169440 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 17:40:40,020 INFO MainThread:3169440 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 17:40:40,020 INFO MainThread:3169440 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 17:40:40,020 INFO MainThread:3169440 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug.log +2025-12-06 17:40:40,020 INFO MainThread:3169440 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_174040-4zw2qz8d/logs/debug-internal.log +2025-12-06 17:40:40,020 INFO MainThread:3169440 [wandb_init.py:init():841] calling init triggers +2025-12-06 17:40:40,021 INFO MainThread:3169440 [wandb_init.py:init():846] wandb.init 
called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 17:40:40,021 INFO MainThread:3169440 [wandb_init.py:init():889] starting backend +2025-12-06 17:40:40,270 INFO MainThread:3169440 [wandb_init.py:init():892] sending inform_init request +2025-12-06 17:40:40,274 INFO MainThread:3169440 [wandb_init.py:init():900] backend started and connected +2025-12-06 17:40:40,275 INFO MainThread:3169440 [wandb_init.py:init():970] updated telemetry +2025-12-06 17:40:40,279 INFO MainThread:3169440 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 17:40:40,671 INFO MainThread:3169440 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 17:40:40,778 INFO MainThread:3169440 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 17:40:40,778 INFO MainThread:3169440 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 17:40:40,778 INFO MainThread:3169440 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 17:40:40,778 INFO MainThread:3169440 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-06 17:40:40,781 INFO MainThread:3169440 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 17:40:40,781 INFO MainThread:3169440 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 17:41:12,609 INFO wandb-AsyncioManager-main:3169440 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 17:41:12,609 INFO wandb-AsyncioManager-main:3169440 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
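Note on the failure in this run (4zw2qz8d): the output.log above shows diffusers' ModelMixin.enable_gradient_checkpointing calling self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=...), while WanDiscreteVideoTransformer still implements an older signature, hence the TypeError. The sketch below is an override compatible with that call site; the transformer's internals are not part of this diff, so treating a per-block "gradient_checkpointing" attribute as the flag to toggle is an assumption.

    # Sketch only: a _set_gradient_checkpointing override matching the keyword
    # arguments used by the diffusers 0.35 call site shown in the traceback.
    # Intended as a method on the custom transformer class; attribute names on
    # the blocks are assumptions, not read from transformer_video.py.
    from torch.utils.checkpoint import checkpoint

    def _set_gradient_checkpointing(self, enable: bool = True,
                                    gradient_checkpointing_func=checkpoint):
        for module in self.modules():
            if hasattr(module, "gradient_checkpointing"):
                module._gradient_checkpointing_func = gradient_checkpointing_func
                module.gradient_checkpointing = enable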
diff --git a/Meissonic/wandb/run-20251206_174040-4zw2qz8d/run-4zw2qz8d.wandb b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/run-4zw2qz8d.wandb new file mode 100644 index 0000000000000000000000000000000000000000..37add921722b682ea95c74d8a2c00e30f1650cde Binary files /dev/null and b/Meissonic/wandb/run-20251206_174040-4zw2qz8d/run-4zw2qz8d.wandb differ diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/config.yaml b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c59a26a6d1bd06457cff93903a4a3a58653b90e --- /dev/null +++ b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + q1a40zzinx990om27uhkydc0fepq8lyl: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11624821403648" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T17:43:50.375695Z" + writerId: q1a40zzinx990om27uhkydc0fepq8lyl + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/output.log b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..64e6742914e4fea45a06d182d6421877d3c258d8 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/output.log @@ -0,0 +1,28 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|█████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4208.13it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 584, in main + import bitsandbytes as bnb +ModuleNotFoundError: No module named 'bitsandbytes' + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 586, in main + raise ImportError( +ImportError: Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes` +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 584, in main + import bitsandbytes as bnb +ModuleNotFoundError: No module named 'bitsandbytes' + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 586, in main + raise ImportError( +ImportError: Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes` diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/requirements.txt b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..29f8ef97a1a3b2dda14232bbc1d655bc5e428b19 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/requirements.txt @@ -0,0 +1,138 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 
+aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..933a55f88ba6acac8c4bb802056b411abfda9801 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-06T17:43:50.375695Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11624821403648" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 
6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "q1a40zzinx990om27uhkydc0fepq8lyl" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/wandb-summary.json b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f8324c5f59b32d7c8f40b6c860a387c4c76c0c15 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":32},"_runtime":32} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug-core.log b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..e90a0d8ed54f51da4ef1ca9184d73b89c188481a --- /dev/null +++ b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T17:43:50.445359103Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp_jg2vina/port-3171238.txt","pid":3171238,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T17:43:50.445779509Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3171238} +{"time":"2025-12-06T17:43:50.445794062Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3171238-3171446-614868002/socket","Net":"unix"}} +{"time":"2025-12-06T17:43:50.632760026Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T17:43:50.638385451Z","level":"INFO","msg":"handleInformInit: received","streamId":"3l5a8i6m","id":"1(@)"} +{"time":"2025-12-06T17:43:50.907191091Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"3l5a8i6m","id":"1(@)"} +{"time":"2025-12-06T17:44:23.462048752Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T17:44:23.462086106Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-06T17:44:23.462083734Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-06T17:44:23.462221941Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3171238-3171446-614868002/socket","Net":"unix"}} 
+{"time":"2025-12-06T17:44:23.462251991Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-06T17:44:23.806523343Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T17:44:23.806545257Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T17:44:23.806554695Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug-internal.log b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..923f5bd10e38724b4b29ea14edbffc419d512a34 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T17:43:50.638478727Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T17:43:50.907006552Z","level":"INFO","msg":"stream: created new stream","id":"3l5a8i6m"} +{"time":"2025-12-06T17:43:50.907081307Z","level":"INFO","msg":"handler: started","stream_id":"3l5a8i6m"} +{"time":"2025-12-06T17:43:50.907182878Z","level":"INFO","msg":"stream: started","id":"3l5a8i6m"} +{"time":"2025-12-06T17:43:50.907201966Z","level":"INFO","msg":"writer: started","stream_id":"3l5a8i6m"} +{"time":"2025-12-06T17:43:50.907205195Z","level":"INFO","msg":"sender: started","stream_id":"3l5a8i6m"} +{"time":"2025-12-06T17:44:23.462101845Z","level":"INFO","msg":"stream: closing","id":"3l5a8i6m"} +{"time":"2025-12-06T17:44:23.71012953Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T17:44:23.803029793Z","level":"INFO","msg":"handler: closed","stream_id":"3l5a8i6m"} +{"time":"2025-12-06T17:44:23.803114706Z","level":"INFO","msg":"sender: closed","stream_id":"3l5a8i6m"} +{"time":"2025-12-06T17:44:23.803121867Z","level":"INFO","msg":"stream: closed","id":"3l5a8i6m"} diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug.log b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ed3370b9e1a242c711916fd14bc852a0858aaed0 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_setup.py:_flush():80] Configure stats pid to 3171238 +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug.log +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_174350-3l5a8i6m/logs/debug-internal.log +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_init.py:init():841] calling init triggers +2025-12-06 17:43:50,378 INFO MainThread:3171238 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 17:43:50,378 
INFO MainThread:3171238 [wandb_init.py:init():889] starting backend +2025-12-06 17:43:50,633 INFO MainThread:3171238 [wandb_init.py:init():892] sending inform_init request +2025-12-06 17:43:50,637 INFO MainThread:3171238 [wandb_init.py:init():900] backend started and connected +2025-12-06 17:43:50,638 INFO MainThread:3171238 [wandb_init.py:init():970] updated telemetry +2025-12-06 17:43:50,642 INFO MainThread:3171238 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 17:43:51,086 INFO MainThread:3171238 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 17:43:51,195 INFO MainThread:3171238 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 17:43:51,195 INFO MainThread:3171238 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 17:43:51,195 INFO MainThread:3171238 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 17:43:51,195 INFO MainThread:3171238 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-06 17:43:51,198 INFO MainThread:3171238 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 17:43:51,198 INFO MainThread:3171238 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 17:44:23,462 INFO wandb-AsyncioManager-main:3171238 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 17:44:23,462 INFO wandb-AsyncioManager-main:3171238 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
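Note on the failure in this run (3l5a8i6m): --use_8bit_adam is set but bitsandbytes is missing from the environment (it does not appear in this run's requirements.txt), so the guarded import in train_mei_video.py raises the ImportError captured in output.log. The next run's requirements.txt records bitsandbytes==0.48.2, so the package was installed before retrying. A minimal sketch of the guarded-import pattern the traceback implies follows; the exact code in train_mei_video.py is not shown in this diff, so everything other than bnb.optim.AdamW8bit is illustrative.

    # Sketch of the guarded 8-bit Adam selection implied by the traceback.
    try:
        import bitsandbytes as bnb
    except ImportError:
        raise ImportError(
            "Please install bitsandbytes to use 8-bit Adam. "
            "You can do so by running `pip install bitsandbytes`"
        )
    optimizer_cls = bnb.optim.AdamW8bit  # real bitsandbytes optimizer class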
diff --git a/Meissonic/wandb/run-20251206_174350-3l5a8i6m/run-3l5a8i6m.wandb b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/run-3l5a8i6m.wandb new file mode 100644 index 0000000000000000000000000000000000000000..df3aa07343720b5ff598752835905d435192fb6e Binary files /dev/null and b/Meissonic/wandb/run-20251206_174350-3l5a8i6m/run-3l5a8i6m.wandb differ diff --git a/Meissonic/wandb/run-20251206_174448-9d8awgri/files/config.yaml b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aba9cea1cde6a944caffb88b21f768fe50eb2205 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 88g6l38gdjpfyz6jlyw7nube4byh68rn: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625034280960" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T17:44:48.713011Z" + writerId: 88g6l38gdjpfyz6jlyw7nube4byh68rn + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_174448-9d8awgri/files/output.log b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..dfd21aaef60d9d385f715ad9ee525ee05bc5de49 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/output.log @@ -0,0 +1,80 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|█████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6292.35it/s] +12/06/2025 17:45:20 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/06/2025 17:45:27 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/06/2025 17:45:27 - INFO - train.dataset_utils - Using decord for video loading +12/06/2025 17:45:27 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/06/2025 17:45:31 - INFO - __main__ - ***** Running training ***** +12/06/2025 17:45:31 - INFO - __main__ - Num training steps = 10000 +12/06/2025 17:45:31 - INFO - __main__ - Instantaneous batch size per device = 4 +12/06/2025 17:45:31 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4 +12/06/2025 17:45:31 - INFO - __main__ - Gradient Accumulation steps = 1 +12/06/2025 17:45:31 - WARNING - train.dataset_utils - Failed to load video 000/794/643/794643.mp4: 'Tensor' object has no attribute 'asnumpy' +12/06/2025 17:45:32 - WARNING - train.dataset_utils - Failed to load video 000/542/931/542931.mp4: 'Tensor' object has no attribute 'asnumpy' +12/06/2025 17:45:32 - WARNING - train.dataset_utils - Failed to load video 000/031/953/31953.mp4: 'Tensor' object has no attribute 'asnumpy' +12/06/2025 17:45:32 - WARNING - train.dataset_utils - Failed to load video 000/971/703/971703.mp4: 'Tensor' object has no attribute 'asnumpy' +12/06/2025 17:45:33 - WARNING - train.dataset_utils - Failed to load video 000/505/892/505892.mp4: 'Tensor' object has no attribute 'asnumpy' +12/06/2025 17:45:33 - WARNING - train.dataset_utils - Failed to load video 000/732/292/732292.mp4: 'Tensor' object has no attribute 'asnumpy' +12/06/2025 17:45:33 - WARNING - train.dataset_utils - Failed to load video 000/648/233/648233.mp4: 'Tensor' object has no attribute 'asnumpy' +12/06/2025 17:45:33 - WARNING - train.dataset_utils - Failed to load video 000/101/276/101276.mp4: 'Tensor' object has no attribute 'asnumpy' +[DEBUG-transformer] Input: tokens.shape=torch.Size([4, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([4, 512, 768]), timesteps.shape=torch.Size([4]) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 847, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 893, in forward + print(f"[DEBUG-transformer] Input: tokens.shape={tokens.shape}, encoder_hidden_states.shape={encoder_hidden_states.shape}, timesteps.shape={timesteps.shape}") + File "/mnt/Meissonic/src/transformer_video.py", line 894, in torch_dynamo_resume_in_forward_at_893 + x_list = self._tokens_to_video(tokens) + File "/mnt/Meissonic/src/transformer_video.py", line 838, in _tokens_to_video + assert f == self.num_frames, f'num_frames mismatch: config={self.num_frames}, input={f}' +AssertionError: num_frames mismatch: config=2, input=3 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1019, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 847, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 893, in forward + print(f"[DEBUG-transformer] Input: tokens.shape={tokens.shape}, encoder_hidden_states.shape={encoder_hidden_states.shape}, timesteps.shape={timesteps.shape}") + File "/mnt/Meissonic/src/transformer_video.py", line 894, in torch_dynamo_resume_in_forward_at_893 + x_list = self._tokens_to_video(tokens) + File "/mnt/Meissonic/src/transformer_video.py", line 838, in _tokens_to_video + assert f == 
self.num_frames, f'num_frames mismatch: config={self.num_frames}, input={f}' +AssertionError: num_frames mismatch: config=2, input=3 diff --git a/Meissonic/wandb/run-20251206_174448-9d8awgri/files/requirements.txt b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_174448-9d8awgri/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9482b14c931dabad128839dda1a084515455bc70 --- /dev/null +++ 
b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-06T17:44:48.713011Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625034280960" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "88g6l38gdjpfyz6jlyw7nube4byh68rn" +} \ No newline at end of file diff --git 
a/Meissonic/wandb/run-20251206_174448-9d8awgri/files/wandb-summary.json b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6cd1bee6e3ca6dccd47bd974c513f1ce5d596373 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174448-9d8awgri/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":46},"_runtime":46} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug-core.log b/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2218bfaf282ceb51fa53df291c7df07322eca097 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T17:44:48.778798813Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp82iqezch/port-3172096.txt","pid":3172096,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T17:44:48.779229443Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3172096} +{"time":"2025-12-06T17:44:48.779243236Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3172096-3172319-1744563263/socket","Net":"unix"}} +{"time":"2025-12-06T17:44:48.964555838Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T17:44:48.970407286Z","level":"INFO","msg":"handleInformInit: received","streamId":"9d8awgri","id":"1(@)"} +{"time":"2025-12-06T17:44:49.140144084Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"9d8awgri","id":"1(@)"} +{"time":"2025-12-06T17:45:36.105866692Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T17:45:36.105926883Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-06T17:45:36.105924794Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-06T17:45:36.106028942Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-06T17:45:36.106015668Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3172096-3172319-1744563263/socket","Net":"unix"}} +{"time":"2025-12-06T17:45:36.494303965Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T17:45:36.494336899Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T17:45:36.494353461Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug-internal.log b/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..950dc60964a94eb96ca0b47716730e894c18695b --- /dev/null +++ b/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T17:44:48.970530508Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T17:44:49.139842516Z","level":"INFO","msg":"stream: created new stream","id":"9d8awgri"} +{"time":"2025-12-06T17:44:49.14004132Z","level":"INFO","msg":"handler: started","stream_id":"9d8awgri"} +{"time":"2025-12-06T17:44:49.140136801Z","level":"INFO","msg":"stream: started","id":"9d8awgri"} 
+{"time":"2025-12-06T17:44:49.140151497Z","level":"INFO","msg":"writer: started","stream_id":"9d8awgri"} +{"time":"2025-12-06T17:44:49.140151896Z","level":"INFO","msg":"sender: started","stream_id":"9d8awgri"} +{"time":"2025-12-06T17:45:36.105939576Z","level":"INFO","msg":"stream: closing","id":"9d8awgri"} +{"time":"2025-12-06T17:45:36.349512425Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T17:45:36.491190274Z","level":"INFO","msg":"handler: closed","stream_id":"9d8awgri"} +{"time":"2025-12-06T17:45:36.491329681Z","level":"INFO","msg":"sender: closed","stream_id":"9d8awgri"} +{"time":"2025-12-06T17:45:36.491338964Z","level":"INFO","msg":"stream: closed","id":"9d8awgri"} diff --git a/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug.log b/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..bb491c4768831101246c069a21a74679bedfc772 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 17:44:48,715 INFO MainThread:3172096 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 17:44:48,715 INFO MainThread:3172096 [wandb_setup.py:_flush():80] Configure stats pid to 3172096 +2025-12-06 17:44:48,715 INFO MainThread:3172096 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 17:44:48,715 INFO MainThread:3172096 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 17:44:48,715 INFO MainThread:3172096 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 17:44:48,715 INFO MainThread:3172096 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug.log +2025-12-06 17:44:48,715 INFO MainThread:3172096 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_174448-9d8awgri/logs/debug-internal.log +2025-12-06 17:44:48,716 INFO MainThread:3172096 [wandb_init.py:init():841] calling init triggers +2025-12-06 17:44:48,716 INFO MainThread:3172096 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 17:44:48,716 INFO MainThread:3172096 [wandb_init.py:init():889] starting backend +2025-12-06 17:44:48,964 INFO MainThread:3172096 [wandb_init.py:init():892] sending inform_init request +2025-12-06 17:44:48,968 INFO MainThread:3172096 [wandb_init.py:init():900] backend started and connected +2025-12-06 17:44:48,970 INFO MainThread:3172096 [wandb_init.py:init():970] updated telemetry +2025-12-06 17:44:48,974 INFO MainThread:3172096 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 17:44:49,466 INFO MainThread:3172096 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 17:44:49,575 INFO MainThread:3172096 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 17:44:49,575 INFO MainThread:3172096 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 17:44:49,575 INFO MainThread:3172096 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 17:44:49,575 INFO MainThread:3172096 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-06 17:44:49,578 INFO MainThread:3172096 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 17:44:49,579 INFO MainThread:3172096 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 17:45:36,106 INFO wandb-AsyncioManager-main:3172096 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 17:45:36,106 INFO wandb-AsyncioManager-main:3172096 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
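Note on the first run (9d8awgri): its output.log above shows repeated `Failed to load video ...: 'Tensor' object has no attribute 'asnumpy'` warnings before the crash. That message usually means decord's bridge was set to "torch", so `VideoReader.get_batch()` already returns a `torch.Tensor` while the loader still calls `.asnumpy()` on it. Below is a minimal sketch of a loader that tolerates either bridge mode; the helper name and the sampling/normalisation details are illustrative and are not the actual code in train/dataset_utils.py.

```python
# Hypothetical sketch: handle both decord bridge modes. With the native bridge,
# get_batch() returns a decord NDArray (which has .asnumpy()); with the "torch"
# bridge it already returns a torch.Tensor, so calling .asnumpy() raises AttributeError.
import decord
import torch


def load_video_frames(path: str, indices: list[int]) -> torch.Tensor:
    vr = decord.VideoReader(path)
    batch = vr.get_batch(indices)
    if isinstance(batch, torch.Tensor):
        frames = batch                                # "torch" bridge: use as-is
    else:
        frames = torch.from_numpy(batch.asnumpy())    # native bridge: convert via numpy
    # (T, H, W, C) uint8 -> (T, C, H, W) float in [0, 1]
    return frames.permute(0, 3, 1, 2).float() / 255.0
```

The same run then aborts with `num_frames mismatch: config=2, input=3`: the transformer was configured with the naively computed 16 / 8 = 2 latent frames, while the Cosmos DV8x16x16 tokenizer produced 3, presumably because it is causal and keeps the first frame separate (1 + ceil((16 - 1) / 8) = 3). The later runs query the tokenizer for the actual compressed dimensions instead of relying on the theoretical value.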
diff --git a/Meissonic/wandb/run-20251206_174448-9d8awgri/run-9d8awgri.wandb b/Meissonic/wandb/run-20251206_174448-9d8awgri/run-9d8awgri.wandb new file mode 100644 index 0000000000000000000000000000000000000000..788326b0bd1ca3f1618a0c9e1f71fa5e7d78c9d6 Binary files /dev/null and b/Meissonic/wandb/run-20251206_174448-9d8awgri/run-9d8awgri.wandb differ diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/config.yaml b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0a48dccd76695858ddf1e06ff5c19e36cf5609a --- /dev/null +++ b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + sxpdtwnolpffdhze7xd5yadnyyuyn322: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625034321920" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-06T17:49:31.624712Z" + writerId: sxpdtwnolpffdhze7xd5yadnyyuyn322 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/output.log b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..48bc5963ac926d047055e215470e9ec59f5292e0 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/output.log @@ -0,0 +1,53 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|█████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3870.30it/s] +12/06/2025 17:49:39 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/06/2025 17:49:39 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/06/2025 17:50:04 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/06/2025 17:50:11 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/06/2025 17:50:11 - INFO - train.dataset_utils - Using decord for video loading +12/06/2025 17:50:11 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/06/2025 17:50:15 - INFO - __main__ - ***** Running training ***** +12/06/2025 17:50:15 - INFO - __main__ - Num training steps = 10000 +12/06/2025 17:50:15 - INFO - __main__ - Instantaneous batch size per device = 4 +12/06/2025 17:50:15 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4 +12/06/2025 17:50:15 - INFO - __main__ - Gradient Accumulation steps = 1 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1027, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 774, in main + for batch in train_dataloader: + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 567, in __iter__ + current_batch = next(dataloader_iter) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ + data = self._next_data() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/mnt/Meissonic/train/dataset_utils.py", line 615, in __getitem__ + video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) + File "/mnt/Meissonic/train/dataset_utils.py", line 303, in process_video + video_tensor.view(C * F, 1, H, W), +RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. 
+Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1027, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 774, in main + for batch in train_dataloader: + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 567, in __iter__ + current_batch = next(dataloader_iter) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ + data = self._next_data() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/mnt/Meissonic/train/dataset_utils.py", line 615, in __getitem__ + video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) + File "/mnt/Meissonic/train/dataset_utils.py", line 303, in process_video + video_tensor.view(C * F, 1, H, W), +RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/requirements.txt b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 
+anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/wandb-metadata.json b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..956ca37025f524fa49dc9edac0096ed1e2340f22 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-06T17:49:31.624712Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625034321920" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": 
"GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "sxpdtwnolpffdhze7xd5yadnyyuyn322" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/wandb-summary.json b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3ec09bc7401d0ad79cc5d30b398a997aeec874f5 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174931-dw96vw8c/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":43},"_runtime":43} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug-core.log b/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..01ff4fa9ed17f9beb8703fc6bb3550cabe593c0c --- /dev/null +++ b/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-06T17:49:31.689702797Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpden1zxw3/port-3174937.txt","pid":3174937,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-06T17:49:31.690259079Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3174937} +{"time":"2025-12-06T17:49:31.690247428Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3174937-3175167-2809527005/socket","Net":"unix"}} +{"time":"2025-12-06T17:49:31.876961861Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-06T17:49:31.882645991Z","level":"INFO","msg":"handleInformInit: received","streamId":"dw96vw8c","id":"1(@)"} +{"time":"2025-12-06T17:49:32.048297893Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"dw96vw8c","id":"1(@)"} +{"time":"2025-12-06T17:50:15.644829865Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-06T17:50:15.644875369Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-06T17:50:15.644893461Z","level":"INFO","msg":"server is shutting down"} 
+{"time":"2025-12-06T17:50:15.644955608Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-06T17:50:15.645008905Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3174937-3175167-2809527005/socket","Net":"unix"}} +{"time":"2025-12-06T17:50:16.149404666Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-06T17:50:16.149432223Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-06T17:50:16.149446757Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug-internal.log b/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a6c7510a2fc790198ed2b988592ef0a84188d97b --- /dev/null +++ b/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-06T17:49:31.882736065Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-06T17:49:32.048112058Z","level":"INFO","msg":"stream: created new stream","id":"dw96vw8c"} +{"time":"2025-12-06T17:49:32.048183942Z","level":"INFO","msg":"handler: started","stream_id":"dw96vw8c"} +{"time":"2025-12-06T17:49:32.048290863Z","level":"INFO","msg":"stream: started","id":"dw96vw8c"} +{"time":"2025-12-06T17:49:32.048304095Z","level":"INFO","msg":"writer: started","stream_id":"dw96vw8c"} +{"time":"2025-12-06T17:49:32.048307033Z","level":"INFO","msg":"sender: started","stream_id":"dw96vw8c"} +{"time":"2025-12-06T17:50:15.644889895Z","level":"INFO","msg":"stream: closing","id":"dw96vw8c"} +{"time":"2025-12-06T17:50:16.035790546Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-06T17:50:16.146248452Z","level":"INFO","msg":"handler: closed","stream_id":"dw96vw8c"} +{"time":"2025-12-06T17:50:16.146354679Z","level":"INFO","msg":"sender: closed","stream_id":"dw96vw8c"} +{"time":"2025-12-06T17:50:16.146364849Z","level":"INFO","msg":"stream: closed","id":"dw96vw8c"} diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug.log b/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..4c255b1f31469825332f087280d0f65a6b0bdb02 --- /dev/null +++ b/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_setup.py:_flush():80] Configure stats pid to 3174937 +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug.log +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251206_174931-dw96vw8c/logs/debug-internal.log +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_init.py:init():841] 
calling init triggers +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-06 17:49:31,627 INFO MainThread:3174937 [wandb_init.py:init():889] starting backend +2025-12-06 17:49:31,877 INFO MainThread:3174937 [wandb_init.py:init():892] sending inform_init request +2025-12-06 17:49:31,881 INFO MainThread:3174937 [wandb_init.py:init():900] backend started and connected +2025-12-06 17:49:31,882 INFO MainThread:3174937 [wandb_init.py:init():970] updated telemetry +2025-12-06 17:49:31,886 INFO MainThread:3174937 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-06 17:49:32,291 INFO MainThread:3174937 [wandb_init.py:init():1041] starting run threads in backend +2025-12-06 17:49:32,401 INFO MainThread:3174937 [wandb_run.py:_console_start():2521] atexit reg +2025-12-06 17:49:32,402 INFO MainThread:3174937 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-06 17:49:32,402 INFO MainThread:3174937 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-06 17:49:32,402 INFO MainThread:3174937 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-06 17:49:32,405 INFO MainThread:3174937 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-06 17:49:32,406 INFO MainThread:3174937 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-06 17:50:15,645 INFO wandb-AsyncioManager-main:3174937 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-06 17:50:15,645 INFO wandb-AsyncioManager-main:3174937 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
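Note on the second run (dw96vw8c): after fixing the frame-count mismatch by reading F'=3 directly from the tokenizer, it fails one step later in `process_video` with `view size is not compatible with input tensor's size and stride`. That error occurs when the tensor being flattened is not contiguous in memory (typically after a permute or slice), and the message itself points at the fix: use `.reshape(...)` (or `.contiguous().view(...)`). A minimal sketch follows, assuming a (C, F, H, W) layout as the `C * F` in the failing call suggests; the function name and the interpolation step are illustrative, not the real train/dataset_utils.py code.

```python
# Hypothetical sketch of the resize step with .view() replaced by .reshape(), which
# copies when the layout is non-contiguous instead of raising.
import torch
import torch.nn.functional as nnf  # aliased to avoid clashing with a local name F


def resize_frames(video: torch.Tensor, height: int, width: int) -> torch.Tensor:
    C, F, H, W = video.shape
    flat = video.reshape(C * F, 1, H, W)  # was: video.view(C * F, 1, H, W)
    flat = nnf.interpolate(flat, size=(height, width), mode="bilinear", align_corners=False)
    return flat.reshape(C, F, height, width)
```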
diff --git a/Meissonic/wandb/run-20251206_174931-dw96vw8c/run-dw96vw8c.wandb b/Meissonic/wandb/run-20251206_174931-dw96vw8c/run-dw96vw8c.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a0b86c5da1be4fad286af0e909900b74f9513b3d Binary files /dev/null and b/Meissonic/wandb/run-20251206_174931-dw96vw8c/run-dw96vw8c.wandb differ diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/files/config.yaml b/Meissonic/wandb/run-20251207_081904-26in63ms/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef2d827b97961bb92eedf6d8545c4066967ab170 --- /dev/null +++ b/Meissonic/wandb/run-20251207_081904-26in63ms/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + hhslq2ehf0jf1h9fmd51hovursg7w9ju: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625036812288" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T08:19:04.755671Z" + writerId: hhslq2ehf0jf1h9fmd51hovursg7w9ju + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/files/output.log b/Meissonic/wandb/run-20251207_081904-26in63ms/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..168e17936aef07178eeeac7623789090a51c28e9 --- /dev/null +++ b/Meissonic/wandb/run-20251207_081904-26in63ms/files/output.log @@ -0,0 +1,34 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7997.86it/s] +12/07/2025 08:19:12 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 08:19:12 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 08:19:36 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 08:19:43 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 08:19:43 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 08:19:43 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 08:19:46 - INFO - __main__ - ***** Running training ***** +12/07/2025 08:19:46 - INFO - __main__ - Num training steps = 10000 +12/07/2025 08:19:46 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 08:19:46 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4 +12/07/2025 08:19:46 - INFO - __main__ - Gradient Accumulation steps = 1 +[DEBUG-transformer] Input: tokens.shape=torch.Size([4, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([4, 512, 768]), timesteps.shape=torch.Size([4]) +[DEBUG-transformer] After conversion: len(x_list)=4, len(context_list)=4 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([4, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=4 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([4, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([4, 65537, 3, 30, 52]) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1027, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 869, in main + loss = F.cross_entropy( +UnboundLocalError: local variable 'F' referenced before assignment +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1027, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 869, in main + loss = F.cross_entropy( +UnboundLocalError: local variable 'F' referenced before assignment diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/files/requirements.txt b/Meissonic/wandb/run-20251207_081904-26in63ms/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_081904-26in63ms/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 
+itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_081904-26in63ms/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2b99a3946ca17e0d0af8ee4ac81162e4a40eccda --- /dev/null +++ b/Meissonic/wandb/run-20251207_081904-26in63ms/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T08:19:04.755671Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + 
"500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625036812288" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "hhslq2ehf0jf1h9fmd51hovursg7w9ju" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/files/wandb-summary.json b/Meissonic/wandb/run-20251207_081904-26in63ms/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..cfcb5c8fb84baf8eb1decd5892adc3acd169ba2b --- /dev/null +++ b/Meissonic/wandb/run-20251207_081904-26in63ms/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":50},"_runtime":50} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug-core.log b/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..ee022bd94ffe82036ef1c9f73c980569227ea7ac --- /dev/null +++ b/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T08:19:04.820250562Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpkdfq77zh/port-3535865.txt","pid":3535865,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} 
+{"time":"2025-12-07T08:19:04.820685844Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3535865} +{"time":"2025-12-07T08:19:04.82105163Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3535865-3536107-1233510494/socket","Net":"unix"}} +{"time":"2025-12-07T08:19:05.007364516Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T08:19:05.013054421Z","level":"INFO","msg":"handleInformInit: received","streamId":"26in63ms","id":"1(@)"} +{"time":"2025-12-07T08:19:05.184803328Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"26in63ms","id":"1(@)"} +{"time":"2025-12-07T08:19:55.635775549Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T08:19:55.635834262Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T08:19:55.635846104Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T08:19:55.63588898Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T08:19:55.635962849Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3535865-3536107-1233510494/socket","Net":"unix"}} +{"time":"2025-12-07T08:19:55.989995222Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T08:19:55.9900157Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T08:19:55.990029439Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug-internal.log b/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2fc0b67ce56c745e8dcf884be748aee9d6ade9b6 --- /dev/null +++ b/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T08:19:05.013169359Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T08:19:05.184678435Z","level":"INFO","msg":"stream: created new stream","id":"26in63ms"} +{"time":"2025-12-07T08:19:05.18473231Z","level":"INFO","msg":"handler: started","stream_id":"26in63ms"} +{"time":"2025-12-07T08:19:05.184796038Z","level":"INFO","msg":"stream: started","id":"26in63ms"} +{"time":"2025-12-07T08:19:05.18481126Z","level":"INFO","msg":"writer: started","stream_id":"26in63ms"} +{"time":"2025-12-07T08:19:05.184810662Z","level":"INFO","msg":"sender: started","stream_id":"26in63ms"} +{"time":"2025-12-07T08:19:55.635843201Z","level":"INFO","msg":"stream: closing","id":"26in63ms"} +{"time":"2025-12-07T08:19:55.881501407Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T08:19:55.985405282Z","level":"INFO","msg":"handler: closed","stream_id":"26in63ms"} +{"time":"2025-12-07T08:19:55.985487978Z","level":"INFO","msg":"sender: closed","stream_id":"26in63ms"} +{"time":"2025-12-07T08:19:55.985494997Z","level":"INFO","msg":"stream: closed","id":"26in63ms"} diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug.log b/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..6c60cb70f5225b67f4e91c94f264d50c77e59039 --- /dev/null +++ b/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 08:19:04,758 INFO 
MainThread:3535865 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_setup.py:_flush():80] Configure stats pid to 3535865 +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug.log +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_081904-26in63ms/logs/debug-internal.log +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_init.py:init():841] calling init triggers +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 08:19:04,758 INFO MainThread:3535865 [wandb_init.py:init():889] starting backend +2025-12-07 08:19:05,007 INFO MainThread:3535865 [wandb_init.py:init():892] sending inform_init request +2025-12-07 08:19:05,011 INFO MainThread:3535865 [wandb_init.py:init():900] backend started and connected +2025-12-07 08:19:05,012 INFO MainThread:3535865 [wandb_init.py:init():970] updated telemetry +2025-12-07 08:19:05,016 INFO MainThread:3535865 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 08:19:05,374 INFO MainThread:3535865 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 08:19:05,483 INFO MainThread:3535865 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 08:19:05,483 INFO MainThread:3535865 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 08:19:05,483 INFO MainThread:3535865 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 08:19:05,483 INFO MainThread:3535865 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 08:19:05,486 INFO MainThread:3535865 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 08:19:05,487 INFO MainThread:3535865 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-07 08:19:55,635 INFO wandb-AsyncioManager-main:3535865 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 08:19:55,635 INFO wandb-AsyncioManager-main:3535865 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
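The `UnboundLocalError: local variable 'F' referenced before assignment` recorded in this run's output.log is the classic symptom of the name `F` being bound somewhere inside `main()` (for example by a conditional `import torch.nn.functional as F` or a local assignment), which makes Python treat `F` as a local for the entire function and shadows the module-level import. The sketch below reproduces that failure mode under this assumption; it is not the actual contents of `train_mei_video.py`, and `compute_loss_buggy` / `compute_loss_fixed` are hypothetical names.

```python
import torch
import torch.nn.functional as F  # module-level import, as a training script normally has


def compute_loss_buggy(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    use_custom_loss = False
    if use_custom_loss:
        # Any import or assignment of the name F inside the function makes Python
        # treat F as a *local* variable for the whole function body...
        import torch.nn.functional as F  # noqa: F811
    # ...so when the branch above is skipped, this line raises
    # "UnboundLocalError: local variable 'F' referenced before assignment".
    return F.cross_entropy(logits, targets)


def compute_loss_fixed(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    # Fix: drop the shadowing binding and rely on the single module-level import.
    return F.cross_entropy(logits, targets)


if __name__ == "__main__":
    logits = torch.randn(4, 10)
    targets = torch.randint(0, 10, (4,))
    print(compute_loss_fixed(logits, targets))
    try:
        compute_loss_buggy(logits, targets)
    except UnboundLocalError as e:
        print(e)
```

The next run (082157) no longer hits this error, which is consistent with the shadowing binding having been removed between runs.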
diff --git a/Meissonic/wandb/run-20251207_081904-26in63ms/run-26in63ms.wandb b/Meissonic/wandb/run-20251207_081904-26in63ms/run-26in63ms.wandb new file mode 100644 index 0000000000000000000000000000000000000000..1ddf1d3ccd6728f1402e872db236c12f5bdffbe6 Binary files /dev/null and b/Meissonic/wandb/run-20251207_081904-26in63ms/run-26in63ms.wandb differ diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/config.yaml b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2c969eb1b3be60591f0468f36acc79a63bc22e6 --- /dev/null +++ b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + za20x08jeyip9hiff8kmov7ccuiqswcr: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625036992512" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T08:21:57.018614Z" + writerId: za20x08jeyip9hiff8kmov7ccuiqswcr + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/output.log b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..82de3ee3d3e217bc6a39730ebd25ccba3bd33253 --- /dev/null +++ b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/output.log @@ -0,0 +1,38 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8340.95it/s] +12/07/2025 08:22:04 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 08:22:04 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 08:22:28 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 08:22:35 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 08:22:35 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 08:22:35 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 08:22:39 - INFO - __main__ - ***** Running training ***** +12/07/2025 08:22:39 - INFO - __main__ - Num training steps = 10000 +12/07/2025 08:22:39 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 08:22:39 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4 +12/07/2025 08:22:39 - INFO - __main__ - Gradient Accumulation steps = 1 +[DEBUG-transformer] Input: tokens.shape=torch.Size([4, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([4, 512, 768]), timesteps.shape=torch.Size([4]) +[DEBUG-transformer] After conversion: len(x_list)=4, len(context_list)=4 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([4, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=4 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([4, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([4, 65537, 3, 30, 52]) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1027, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 869, in main + loss = F.cross_entropy( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy + return torch._C._nn.cross_entropy_loss( +ValueError: Expected input batch_size (18720) to match target batch_size (19080). +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1027, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 869, in main + loss = F.cross_entropy( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy + return torch._C._nn.cross_entropy_loss( +ValueError: Expected input batch_size (18720) to match target batch_size (19080). 
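With the `F` shadowing out of the way, this run fails inside `F.cross_entropy` itself, and the cause is visible in the debug shapes above: the transformer emits logits of shape [4, 65537, 3, 30, 52] (65537 classes, presumably a 2^16-entry codebook plus one mask token) while the target tokens keep shape [4, 3, 30, 53]. `cross_entropy` flattens the extra dimensions, so it compares 4·3·30·52 = 18720 input positions against 4·3·30·53 = 19080 targets. A scaled-down sketch reproducing the mismatch follows; the vocabulary is shrunk to keep it light in memory, and the crop at the end is only illustrative, not the script's actual fix.

```python
import torch
import torch.nn.functional as F

# Shapes taken from the debug output above; only the vocabulary is scaled down
# (65537 -> 8), which leaves the batch-size mismatch numbers unchanged.
batch, vocab = 4, 8
logits = torch.randn(batch, vocab, 3, 30, 52)           # model output, width 52
targets = torch.randint(0, vocab, (batch, 3, 30, 53))   # token targets, width 53

try:
    F.cross_entropy(logits, targets)
except Exception as e:
    # cross_entropy flattens the spatial dims, comparing
    # 4*3*30*52 = 18720 input positions against 4*3*30*53 = 19080 targets.
    print(type(e).__name__, e)

# Once logits and targets share the same spatial shape, the loss computes normally.
loss = F.cross_entropy(logits, targets[..., :52])
print(loss)
```

The real fix would be to make the logits and the tokenizer grid agree on the width-53 dimension (or to mask or crop consistently on both sides), since silently dropping a column of targets would train on a misaligned grid.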
diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/requirements.txt b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a2d64bba9559ff55a21faad70bbb89e1b5194e05 --- /dev/null +++ b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 
3.10.19", + "startedAt": "2025-12-07T08:21:57.018614Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625036992512" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "za20x08jeyip9hiff8kmov7ccuiqswcr" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/wandb-summary.json b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/wandb-summary.json new file mode 100644 index 
0000000000000000000000000000000000000000..efa8182429e082366208a948311d46664f4b2c76 --- /dev/null +++ b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":48},"_runtime":48} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug-core.log b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..22d74af713e276510b6d4415d350ff251c25295f --- /dev/null +++ b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T08:21:57.082525248Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpn_0o2vks/port-3538182.txt","pid":3538182,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T08:21:57.082958885Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3538182} +{"time":"2025-12-07T08:21:57.082973769Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3538182-3538417-2609847882/socket","Net":"unix"}} +{"time":"2025-12-07T08:21:57.269964588Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T08:21:57.275303637Z","level":"INFO","msg":"handleInformInit: received","streamId":"ntg4ic1v","id":"1(@)"} +{"time":"2025-12-07T08:21:57.44096558Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ntg4ic1v","id":"1(@)"} +{"time":"2025-12-07T08:22:46.417604272Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T08:22:46.41765507Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T08:22:46.417717814Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T08:22:46.417669362Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T08:22:46.417804498Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3538182-3538417-2609847882/socket","Net":"unix"}} +{"time":"2025-12-07T08:22:46.83115426Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T08:22:46.831174001Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T08:22:46.831186191Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug-internal.log b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..bae28a62792f22503d3046ef0cabf489016b0e9b --- /dev/null +++ b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T08:21:57.275386402Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T08:21:57.440817166Z","level":"INFO","msg":"stream: created new stream","id":"ntg4ic1v"} +{"time":"2025-12-07T08:21:57.440899478Z","level":"INFO","msg":"handler: started","stream_id":"ntg4ic1v"} +{"time":"2025-12-07T08:21:57.440958114Z","level":"INFO","msg":"stream: started","id":"ntg4ic1v"} +{"time":"2025-12-07T08:21:57.440973516Z","level":"INFO","msg":"writer: started","stream_id":"ntg4ic1v"} +{"time":"2025-12-07T08:21:57.440975564Z","level":"INFO","msg":"sender: started","stream_id":"ntg4ic1v"} 
+{"time":"2025-12-07T08:22:46.417679941Z","level":"INFO","msg":"stream: closing","id":"ntg4ic1v"} +{"time":"2025-12-07T08:22:46.704077477Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T08:22:46.828062017Z","level":"INFO","msg":"handler: closed","stream_id":"ntg4ic1v"} +{"time":"2025-12-07T08:22:46.828147815Z","level":"INFO","msg":"sender: closed","stream_id":"ntg4ic1v"} +{"time":"2025-12-07T08:22:46.828155125Z","level":"INFO","msg":"stream: closed","id":"ntg4ic1v"} diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug.log b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..f9efc2fd3e7cb212e9d7142b3c0917edc1aeb03c --- /dev/null +++ b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_setup.py:_flush():80] Configure stats pid to 3538182 +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug.log +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_082157-ntg4ic1v/logs/debug-internal.log +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_init.py:init():841] calling init triggers +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 08:21:57,021 INFO MainThread:3538182 [wandb_init.py:init():889] starting backend +2025-12-07 08:21:57,270 INFO MainThread:3538182 [wandb_init.py:init():892] sending inform_init request +2025-12-07 08:21:57,273 INFO MainThread:3538182 [wandb_init.py:init():900] backend started and connected +2025-12-07 08:21:57,275 INFO MainThread:3538182 [wandb_init.py:init():970] updated telemetry +2025-12-07 08:21:57,279 INFO MainThread:3538182 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 08:21:57,629 INFO MainThread:3538182 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 08:21:57,741 INFO MainThread:3538182 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 08:21:57,741 INFO MainThread:3538182 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 08:21:57,741 INFO MainThread:3538182 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 08:21:57,741 INFO MainThread:3538182 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 08:21:57,745 INFO MainThread:3538182 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 08:21:57,745 INFO MainThread:3538182 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-07 08:22:46,417 INFO wandb-AsyncioManager-main:3538182 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 08:22:46,417 INFO wandb-AsyncioManager-main:3538182 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_082157-ntg4ic1v/run-ntg4ic1v.wandb b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/run-ntg4ic1v.wandb new file mode 100644 index 0000000000000000000000000000000000000000..953867323abb9c384c257583f356005c488052b8 Binary files /dev/null and b/Meissonic/wandb/run-20251207_082157-ntg4ic1v/run-ntg4ic1v.wandb differ diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/config.yaml b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4d0bdbde2e1f693ada3e9ea06d7af2c67f49ff5b --- /dev/null +++ b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 12zvy4ynse8zgviyk22mny7afmqeloby: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625038360576" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T09:18:59.282936Z" + writerId: 12zvy4ynse8zgviyk22mny7afmqeloby + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/output.log b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..3624250b3b8c5df8128bbe3b5f32891af27ae3cd --- /dev/null +++ b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/output.log @@ -0,0 +1,38 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3612.22it/s] +12/07/2025 09:19:06 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 09:19:06 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 09:19:32 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 09:19:39 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 09:19:39 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 09:19:39 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 09:19:43 - INFO - __main__ - ***** Running training ***** +12/07/2025 09:19:43 - INFO - __main__ - Num training steps = 10000 +12/07/2025 09:19:43 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 09:19:43 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4 +12/07/2025 09:19:43 - INFO - __main__ - Gradient Accumulation steps = 1 +[DEBUG-transformer] Input: tokens.shape=torch.Size([4, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([4, 512, 768]), timesteps.shape=torch.Size([4]) +[DEBUG-transformer] After conversion: len(x_list)=4, len(context_list)=4 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([4, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=4 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([4, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([4, 65537, 3, 30, 52]) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1027, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 869, in main + loss = F.cross_entropy( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy + return torch._C._nn.cross_entropy_loss( +ValueError: Expected input batch_size (18720) to match target batch_size (19080). +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1027, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 869, in main + loss = F.cross_entropy( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy + return torch._C._nn.cross_entropy_loss( +ValueError: Expected input batch_size (18720) to match target batch_size (19080). 
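Separately, every run logs a disagreement between the tokenizer's reported compression (F'=3) and the script's "theoretical" value (F'=2, i.e. 16 frames over an 8x temporal stride). One plausible reading, assuming Cosmos-1.0-Tokenizer-DV8x16x16 is causal and encodes the first frame on its own before compressing the remaining frames 8x, is sketched below; the formula is an assumption about the tokenizer, not taken from its documentation.

```python
import math

num_frames = 16   # --num_frames from the run arguments
stride_t = 8      # DV8x16x16: 8x temporal compression

theoretical = num_frames // stride_t                 # 16 // 8 = 2, what the script expects
causal = 1 + math.ceil((num_frames - 1) / stride_t)  # 1 + ceil(15 / 8) = 3, what the tokenizer reports
print(theoretical, causal)  # -> 2 3
```

If that reading holds, any buffer or target grid sized from the theoretical F'=2 rather than the reported F'=3 is another place where input and target shapes can drift apart.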
diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/requirements.txt b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3860d433e93b6fde423026f9846f51be479b5c5e --- /dev/null +++ b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 
3.10.19", + "startedAt": "2025-12-07T09:18:59.282936Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625038360576" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "12zvy4ynse8zgviyk22mny7afmqeloby" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/wandb-summary.json b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/wandb-summary.json new file mode 100644 index 
0000000000000000000000000000000000000000..cfcb5c8fb84baf8eb1decd5892adc3acd169ba2b --- /dev/null +++ b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":50},"_runtime":50} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug-core.log b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2fd0804d922c526acd2adfd0a3ef05b2f8ba18c6 --- /dev/null +++ b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T09:18:59.349539035Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpkhaf8iax/port-3561952.txt","pid":3561952,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T09:18:59.349991702Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3561952} +{"time":"2025-12-07T09:18:59.349994887Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3561952-3562110-3268390799/socket","Net":"unix"}} +{"time":"2025-12-07T09:18:59.534522023Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T09:18:59.540742391Z","level":"INFO","msg":"handleInformInit: received","streamId":"ohf6qu5w","id":"1(@)"} +{"time":"2025-12-07T09:18:59.706170442Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ohf6qu5w","id":"1(@)"} +{"time":"2025-12-07T09:19:50.432029358Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T09:19:50.43209084Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T09:19:50.432083214Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T09:19:50.43219541Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3561952-3562110-3268390799/socket","Net":"unix"}} +{"time":"2025-12-07T09:19:50.432228017Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T09:19:50.861875098Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T09:19:50.861905603Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T09:19:50.861926446Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug-internal.log b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c913fb355d4f91d829a7f38ffa15833c0b8b50a7 --- /dev/null +++ b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T09:18:59.540831005Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T09:18:59.7059592Z","level":"INFO","msg":"stream: created new stream","id":"ohf6qu5w"} +{"time":"2025-12-07T09:18:59.706081849Z","level":"INFO","msg":"handler: started","stream_id":"ohf6qu5w"} +{"time":"2025-12-07T09:18:59.706163709Z","level":"INFO","msg":"stream: started","id":"ohf6qu5w"} +{"time":"2025-12-07T09:18:59.70618647Z","level":"INFO","msg":"sender: started","stream_id":"ohf6qu5w"} +{"time":"2025-12-07T09:18:59.706191416Z","level":"INFO","msg":"writer: started","stream_id":"ohf6qu5w"} 
+{"time":"2025-12-07T09:19:50.432106973Z","level":"INFO","msg":"stream: closing","id":"ohf6qu5w"} +{"time":"2025-12-07T09:19:50.699406989Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T09:19:50.858902779Z","level":"INFO","msg":"handler: closed","stream_id":"ohf6qu5w"} +{"time":"2025-12-07T09:19:50.858998821Z","level":"INFO","msg":"sender: closed","stream_id":"ohf6qu5w"} +{"time":"2025-12-07T09:19:50.859007075Z","level":"INFO","msg":"stream: closed","id":"ohf6qu5w"} diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug.log b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7faee2b5ef9503a97cf613870082c491780839cc --- /dev/null +++ b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_setup.py:_flush():80] Configure stats pid to 3561952 +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug.log +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_091859-ohf6qu5w/logs/debug-internal.log +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_init.py:init():841] calling init triggers +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 09:18:59,285 INFO MainThread:3561952 [wandb_init.py:init():889] starting backend +2025-12-07 09:18:59,534 INFO MainThread:3561952 [wandb_init.py:init():892] sending inform_init request +2025-12-07 09:18:59,538 INFO MainThread:3561952 [wandb_init.py:init():900] backend started and connected +2025-12-07 09:18:59,539 INFO MainThread:3561952 [wandb_init.py:init():970] updated telemetry +2025-12-07 09:18:59,543 INFO MainThread:3561952 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 09:18:59,923 INFO MainThread:3561952 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 09:19:00,030 INFO MainThread:3561952 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 09:19:00,030 INFO MainThread:3561952 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 09:19:00,030 INFO MainThread:3561952 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 09:19:00,030 INFO MainThread:3561952 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 09:19:00,033 INFO MainThread:3561952 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 09:19:00,034 INFO MainThread:3561952 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-07 09:19:50,432 INFO wandb-AsyncioManager-main:3561952 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 09:19:50,432 INFO wandb-AsyncioManager-main:3561952 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_091859-ohf6qu5w/run-ohf6qu5w.wandb b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/run-ohf6qu5w.wandb new file mode 100644 index 0000000000000000000000000000000000000000..d4e4925e01b12e9ad1d7d5c3e024075a87e717db Binary files /dev/null and b/Meissonic/wandb/run-20251207_091859-ohf6qu5w/run-ohf6qu5w.wandb differ diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/files/config.yaml b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..988220344944abbdad6f9afb74f085627cca6a11 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + fezdpopul5h7idzonkh44sxzul6ux92z: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625038528512" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T09:22:30.911002Z" + writerId: fezdpopul5h7idzonkh44sxzul6ux92z + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/files/output.log b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..c8f289506f0e7ecdce5a1f5608759524cdf3bf0e --- /dev/null +++ b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/output.log @@ -0,0 +1,34 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4145.15it/s] +12/07/2025 09:22:38 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 09:22:38 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 09:23:03 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 09:23:11 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 09:23:11 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 09:23:11 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 09:23:14 - INFO - __main__ - ***** Running training ***** +12/07/2025 09:23:14 - INFO - __main__ - Num training steps = 10000 +12/07/2025 09:23:14 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 09:23:14 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4 +12/07/2025 09:23:14 - INFO - __main__ - Gradient Accumulation steps = 1 +[DEBUG-transformer] Input: tokens.shape=torch.Size([4, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([4, 512, 768]), timesteps.shape=torch.Size([4]) +[DEBUG-transformer] After conversion: len(x_list)=4, len(context_list)=4 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([4, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=4 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([4, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([4, 65537, 3, 30, 52]) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1038, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 878, in main + labels_flat = labels.view(-1) +RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1038, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 878, in main + labels_flat = labels.view(-1) +RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. 
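The traceback above (run abaot6f6) aborts at `labels_flat = labels.view(-1)` in `train_mei_video.py` because the labels tensor is no longer contiguous when it reaches that line; the error text itself points at `.reshape(...)`. A minimal sketch of that fix follows. The tensor shape and the `permute` step are illustrative assumptions — the actual construction of `labels` around line 878 is not visible in this diff.

```python
import torch

# Illustrative reproduction of the failure mode: a permuted (non-contiguous)
# tensor cannot be flattened with .view(). Shape chosen to mirror the logits
# grid reported in the debug output (batch 4, 3 x 30 x 52 token grid).
labels = torch.randint(0, 65537, (4, 3, 30, 52)).permute(0, 2, 3, 1)

# labels.view(-1) would raise the same RuntimeError here, since the permuted
# strides no longer describe one contiguous block of memory.
# reshape() copies only when needed, so it is the drop-in replacement the
# error message suggests:
labels_flat = labels.reshape(-1)

# Equivalent alternative: force a contiguous copy first, then view.
labels_flat = labels.contiguous().view(-1)
```

Either form resolves the error; `reshape` keeps the call site unchanged at the cost of a silent copy for non-contiguous inputs.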
diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/files/requirements.txt b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9499a49a84efb6b9c073d0a5d26517275d39cfee --- /dev/null +++ b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 
3.10.19", + "startedAt": "2025-12-07T09:22:30.911002Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625038528512" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "fezdpopul5h7idzonkh44sxzul6ux92z" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/files/wandb-summary.json b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/wandb-summary.json new file mode 100644 index 
0000000000000000000000000000000000000000..cfcb5c8fb84baf8eb1decd5892adc3acd169ba2b --- /dev/null +++ b/Meissonic/wandb/run-20251207_092230-abaot6f6/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":50},"_runtime":50} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug-core.log b/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..f1d731ab575b2617a0a7ab8f7a272c2072a7f480 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T09:22:30.977231143Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpougs9edx/port-3562969.txt","pid":3562969,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T09:22:30.977768035Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3562969} +{"time":"2025-12-07T09:22:30.977745639Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3562969-3563146-3327338051/socket","Net":"unix"}} +{"time":"2025-12-07T09:22:31.1643071Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T09:22:31.169990002Z","level":"INFO","msg":"handleInformInit: received","streamId":"abaot6f6","id":"1(@)"} +{"time":"2025-12-07T09:22:31.336295538Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"abaot6f6","id":"1(@)"} +{"time":"2025-12-07T09:23:22.451115724Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T09:23:22.451161439Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T09:23:22.451214102Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T09:23:22.451172614Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T09:23:22.4513042Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3562969-3563146-3327338051/socket","Net":"unix"}} +{"time":"2025-12-07T09:23:22.852429703Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T09:23:22.852454356Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T09:23:22.852463907Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug-internal.log b/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b0c4b92debac27f213547fb32e7c74025e17fac1 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T09:22:31.170136939Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T09:22:31.336121952Z","level":"INFO","msg":"stream: created new stream","id":"abaot6f6"} +{"time":"2025-12-07T09:22:31.336192335Z","level":"INFO","msg":"handler: started","stream_id":"abaot6f6"} +{"time":"2025-12-07T09:22:31.336288523Z","level":"INFO","msg":"stream: started","id":"abaot6f6"} +{"time":"2025-12-07T09:22:31.33630291Z","level":"INFO","msg":"writer: started","stream_id":"abaot6f6"} +{"time":"2025-12-07T09:22:31.336303413Z","level":"INFO","msg":"sender: started","stream_id":"abaot6f6"} 
+{"time":"2025-12-07T09:23:22.451180342Z","level":"INFO","msg":"stream: closing","id":"abaot6f6"} +{"time":"2025-12-07T09:23:22.720890333Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T09:23:22.849305035Z","level":"INFO","msg":"handler: closed","stream_id":"abaot6f6"} +{"time":"2025-12-07T09:23:22.849398007Z","level":"INFO","msg":"sender: closed","stream_id":"abaot6f6"} +{"time":"2025-12-07T09:23:22.849406506Z","level":"INFO","msg":"stream: closed","id":"abaot6f6"} diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug.log b/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..66a1d70ad0204b94af7bfa026c74564d174505c7 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 09:22:30,913 INFO MainThread:3562969 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_setup.py:_flush():80] Configure stats pid to 3562969 +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug.log +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_092230-abaot6f6/logs/debug-internal.log +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_init.py:init():841] calling init triggers +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 09:22:30,914 INFO MainThread:3562969 [wandb_init.py:init():889] starting backend +2025-12-07 09:22:31,164 INFO MainThread:3562969 [wandb_init.py:init():892] sending inform_init request +2025-12-07 09:22:31,168 INFO MainThread:3562969 [wandb_init.py:init():900] backend started and connected +2025-12-07 09:22:31,169 INFO MainThread:3562969 [wandb_init.py:init():970] updated telemetry +2025-12-07 09:22:31,173 INFO MainThread:3562969 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 09:22:31,546 INFO MainThread:3562969 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 09:22:31,655 INFO MainThread:3562969 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 09:22:31,655 INFO MainThread:3562969 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 09:22:31,655 INFO MainThread:3562969 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 09:22:31,655 INFO MainThread:3562969 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 09:22:31,659 INFO MainThread:3562969 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 09:22:31,659 INFO MainThread:3562969 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-07 09:23:22,451 INFO wandb-AsyncioManager-main:3562969 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 09:23:22,451 INFO wandb-AsyncioManager-main:3562969 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_092230-abaot6f6/run-abaot6f6.wandb b/Meissonic/wandb/run-20251207_092230-abaot6f6/run-abaot6f6.wandb new file mode 100644 index 0000000000000000000000000000000000000000..41f6970e18e95f416e57a4f09689fa247455c245 Binary files /dev/null and b/Meissonic/wandb/run-20251207_092230-abaot6f6/run-abaot6f6.wandb differ diff --git a/Meissonic/wandb/run-20251207_092440-ov185qc7/files/config.yaml b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..56b7eb0471121941caeb108556242cafdb2c581a --- /dev/null +++ b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + zs0pstxr14tsx3iyic4t3epf9cjkoxfh: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625038692352" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T09:24:40.932234Z" + writerId: zs0pstxr14tsx3iyic4t3epf9cjkoxfh + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251207_092440-ov185qc7/files/output.log b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7a4fcfe2e828069bec5a767730ca046c7009d5da --- /dev/null +++ b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/output.log @@ -0,0 +1,98 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10381.94it/s] +12/07/2025 09:24:48 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 09:24:48 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 09:25:12 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 09:25:19 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 09:25:19 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 09:25:19 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 09:25:22 - INFO - __main__ - ***** Running training ***** +12/07/2025 09:25:22 - INFO - __main__ - Num training steps = 10000 +12/07/2025 09:25:22 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 09:25:22 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4 +12/07/2025 09:25:22 - INFO - __main__ - Gradient Accumulation steps = 1 +[DEBUG-transformer] Input: tokens.shape=torch.Size([4, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([4, 512, 768]), timesteps.shape=torch.Size([4]) +[DEBUG-transformer] After conversion: len(x_list)=4, len(context_list)=4 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([4, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=4 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([4, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([4, 65537, 3, 30, 52]) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1038, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 891, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook + _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn + fn(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 940, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 670, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 419, in forward + e = (self.modulation.unsqueeze(0) + e).chunk(6, dim=2) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 220.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 91.56 MiB is free. Including non-PyTorch memory, this process has 39.40 GiB memory in use. Of the allocated memory 38.53 GiB is allocated by PyTorch, and 361.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1038, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 891, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook + _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn + fn(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 940, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 670, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 419, in forward + e = (self.modulation.unsqueeze(0) + e).chunk(6, dim=2) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 220.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 91.56 MiB is free. Including non-PyTorch memory, this process has 39.40 GiB memory in use. Of the allocated memory 38.53 GiB is allocated by PyTorch, and 361.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251207_092440-ov185qc7/files/requirements.txt b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_092440-ov185qc7/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..84028b0a7ec77fbab7c609f8c0390ced326e510f --- /dev/null +++ 
b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T09:24:40.932234Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625038692352" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "zs0pstxr14tsx3iyic4t3epf9cjkoxfh" +} \ No newline at end of file diff --git 
a/Meissonic/wandb/run-20251207_092440-ov185qc7/files/wandb-summary.json b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..cfcb5c8fb84baf8eb1decd5892adc3acd169ba2b --- /dev/null +++ b/Meissonic/wandb/run-20251207_092440-ov185qc7/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":50},"_runtime":50} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug-core.log b/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..80b9702507d46b7845db41d23f9d9ae63d080a7a --- /dev/null +++ b/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T09:24:41.000651805Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmplul4cgzk/port-3563917.txt","pid":3563917,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T09:24:41.001165273Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3563917} +{"time":"2025-12-07T09:24:41.001154793Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3563917-3564161-963263980/socket","Net":"unix"}} +{"time":"2025-12-07T09:24:41.187039446Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T09:24:41.192569384Z","level":"INFO","msg":"handleInformInit: received","streamId":"ov185qc7","id":"1(@)"} +{"time":"2025-12-07T09:24:41.362418288Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ov185qc7","id":"1(@)"} +{"time":"2025-12-07T09:25:31.755237055Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T09:25:31.75529784Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T09:25:31.75537086Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T09:25:31.755321302Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T09:25:31.755454446Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3563917-3564161-963263980/socket","Net":"unix"}} +{"time":"2025-12-07T09:25:32.104592204Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T09:25:32.104620157Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T09:25:32.10463384Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug-internal.log b/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..834d76666094fa8e7fb442849395f4e030280e25 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T09:24:41.192722256Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T09:24:41.362251763Z","level":"INFO","msg":"stream: created new stream","id":"ov185qc7"} +{"time":"2025-12-07T09:24:41.362310313Z","level":"INFO","msg":"handler: started","stream_id":"ov185qc7"} +{"time":"2025-12-07T09:24:41.362411079Z","level":"INFO","msg":"stream: started","id":"ov185qc7"} 
+{"time":"2025-12-07T09:24:41.362426313Z","level":"INFO","msg":"writer: started","stream_id":"ov185qc7"} +{"time":"2025-12-07T09:24:41.362429878Z","level":"INFO","msg":"sender: started","stream_id":"ov185qc7"} +{"time":"2025-12-07T09:25:31.755308835Z","level":"INFO","msg":"stream: closing","id":"ov185qc7"} +{"time":"2025-12-07T09:25:31.995829835Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T09:25:32.101330546Z","level":"INFO","msg":"handler: closed","stream_id":"ov185qc7"} +{"time":"2025-12-07T09:25:32.101424984Z","level":"INFO","msg":"sender: closed","stream_id":"ov185qc7"} +{"time":"2025-12-07T09:25:32.101434814Z","level":"INFO","msg":"stream: closed","id":"ov185qc7"} diff --git a/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug.log b/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d3cbdbb2b583dfc4295b604a0b3cf7cea162bd7e --- /dev/null +++ b/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_setup.py:_flush():80] Configure stats pid to 3563917 +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug.log +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_092440-ov185qc7/logs/debug-internal.log +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_init.py:init():841] calling init triggers +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 09:24:40,935 INFO MainThread:3563917 [wandb_init.py:init():889] starting backend +2025-12-07 09:24:41,187 INFO MainThread:3563917 [wandb_init.py:init():892] sending inform_init request +2025-12-07 09:24:41,191 INFO MainThread:3563917 [wandb_init.py:init():900] backend started and connected +2025-12-07 09:24:41,192 INFO MainThread:3563917 [wandb_init.py:init():970] updated telemetry +2025-12-07 09:24:41,196 INFO MainThread:3563917 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 09:24:41,549 INFO MainThread:3563917 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 09:24:41,659 INFO MainThread:3563917 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 09:24:41,659 INFO MainThread:3563917 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 09:24:41,659 INFO MainThread:3563917 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 09:24:41,659 INFO MainThread:3563917 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 09:24:41,663 INFO MainThread:3563917 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 09:24:41,663 INFO MainThread:3563917 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-07 09:25:31,755 INFO wandb-AsyncioManager-main:3563917 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 09:25:31,755 INFO wandb-AsyncioManager-main:3563917 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
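Editorial note on the run configuration logged above: the values num_frames=16, video_height=480, video_width=848 and video_tokenizer_model_id=Cosmos-1.0-Tokenizer-DV8x16x16 determine the token-grid sizes reported later in output.log (F'=3, H'=30, W'=53). A minimal sketch of that arithmetic, assuming the DV8x16x16 tokenizer compresses time by 8x and space by 16x per axis, with the first frame encoded causally on its own; the rounding behaviour here is an assumption for illustration, not read from this repository:

import math

# Values taken from the logged run config above.
num_frames, height, width = 16, 480, 848

# Assumed compression of Cosmos-1.0-Tokenizer-DV8x16x16: 8x temporal, 16x spatial,
# with a causal tokenizer that encodes the first frame separately (assumption).
t_factor, s_factor = 8, 16

latent_frames = 1 + math.ceil((num_frames - 1) / t_factor)  # 1 + ceil(15/8) = 3  -> "Actual ... F'=3"
naive_frames = num_frames // t_factor                       # 16 // 8 = 2        -> "Theoretical ... F'=2"
latent_height = height // s_factor                          # 480 // 16 = 30     -> H'=30
latent_width = width // s_factor                            # 848 // 16 = 53     -> W'=53
print(latent_frames, naive_frames, latent_height, latent_width)

The resulting 3x30x53 grid matches the tokens.shape=torch.Size([1, 3, 30, 53]) seen in the [DEBUG-transformer] trace further down in output.log.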
diff --git a/Meissonic/wandb/run-20251207_092440-ov185qc7/run-ov185qc7.wandb b/Meissonic/wandb/run-20251207_092440-ov185qc7/run-ov185qc7.wandb new file mode 100644 index 0000000000000000000000000000000000000000..03a0a8ad7a3f3814d508c3c9f7e61d88193c8ba2 Binary files /dev/null and b/Meissonic/wandb/run-20251207_092440-ov185qc7/run-ov185qc7.wandb differ diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/config.yaml b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5bb284aa42887c7896f472c678993ac69dd83e84 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/config.yaml @@ -0,0 +1,282 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + lle2brcylg0uou8g0xrp4jqlmk7jgekz: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625038815232" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + 
memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T09:25:54.228730Z" + writerId: lle2brcylg0uou8g0xrp4jqlmk7jgekz + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/output.log b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b7b02124e85f29c0cfecb8d4ed2ba224c47bf737 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/output.log @@ -0,0 +1,1699 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
+Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5195.56it/s]
+12/07/2025 09:26:01 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53
+12/07/2025 09:26:01 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53
+12/07/2025 09:26:26 - INFO - __main__ - Creating dataloaders and lr_scheduler
+12/07/2025 09:26:33 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv
+12/07/2025 09:26:33 - INFO - train.dataset_utils - Using decord for video loading
+12/07/2025 09:26:33 - INFO - __main__ - Preparing model, optimizer and dataloaders
+12/07/2025 09:26:37 - INFO - __main__ - ***** Running training *****
+12/07/2025 09:26:37 - INFO - __main__ - Num training steps = 10000
+12/07/2025 09:26:37 - INFO - __main__ - Instantaneous batch size per device = 1
+12/07/2025 09:26:37 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 1
+12/07/2025 09:26:37 - INFO - __main__ - Gradient Accumulation steps = 1
+[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1])
+[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1
+[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53])
+[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768])
+[DEBUG-transformer] t_model.shape=torch.Size([1, 1170])
+[DEBUG-transformer] After backbone: len(out_list)=1
+[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52])
+[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52])
+[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52])
[... the identical [DEBUG-transformer] shape trace above repeats verbatim for every forward pass; duplicate blocks omitted ...]
+12/07/2025 09:27:45 - INFO - __main__ - Step: 50 Loss: 11.0814 LR: 0.000300
[... the same per-forward-pass [DEBUG-transformer] trace continues between logging steps; duplicates omitted ...]
After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 
53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] 
t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] 
out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 
30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52]) +12/07/2025 09:28:45 - INFO - __main__ - Step: 100 Loss: 10.9092 LR: 0.000300 +[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1]) +[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1 +[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53]) +[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768]) +[DEBUG-transformer] t_model.shape=torch.Size([1, 1170]) +[DEBUG-transformer] After backbone: len(out_list)=1 +[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52]) +[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52]) +[DEBUG-transformer] Final 
+[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53])
+[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768])
+[DEBUG-transformer] t_model.shape=torch.Size([1, 1170])
+[DEBUG-transformer] After backbone: len(out_list)=1
+[DEBUG-transformer] out_list[0].shape=torch.Size([16, 3, 30, 52])
+[DEBUG-transformer] After stack: vids.shape=torch.Size([1, 16, 3, 30, 52])
+[DEBUG-transformer] Final logits.shape=torch.Size([1, 65537, 3, 30, 52])
+[... the same [DEBUG-transformer] block (Input -> After conversion -> x_list -> context_list -> t_model -> After backbone -> After stack -> Final logits) is printed verbatim for every forward pass; repeats omitted ...]
+12/07/2025 09:29:45 - INFO - __main__ - Step: 150 Loss: 10.8302 LR: 0.000300
+[... identical [DEBUG-transformer] blocks continue after step 150; repeats omitted ...]
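The trace above records the per-forward tensor flow: each sample's token grid (3, 30, 53) is expanded into a 16-channel tensor, the backbone returns (16, 3, 30, 52), the per-sample outputs are stacked back into a batch, and the final logits carry 65537 channels per grid position. Below is a minimal shape-bookkeeping sketch of that flow; every dimension is read off the log lines above, while the variable names and the 65536+1 vocabulary split are illustrative assumptions rather than identifiers from transformer_video.py. Note the one-column width difference between the logged input grid (W=53) and output grid (W=52).

    import torch

    # Shapes copied from the [DEBUG-transformer] lines above; the names
    # (B, C, F, H, W_IN, ...) are illustrative only, not code identifiers.
    B, C, F, H = 1, 16, 3, 30      # batch, backbone channel dim, latent frames, latent height
    W_IN, W_OUT = 53, 52           # latent width going into vs. coming out of the backbone
    CTX_LEN, CTX_DIM = 512, 768    # context_list[0].shape (text-encoder states)
    VOCAB = 65537                  # logits channels; assumed to be 65536 codes + 1 special token

    tokens = torch.zeros(B, F, H, W_IN, dtype=torch.long)   # "Input: tokens.shape"
    context = torch.zeros(CTX_LEN, CTX_DIM)                 # "context_list[0].shape"
    x0 = torch.zeros(C, F, H, W_IN)                          # "x_list[0].shape"
    out0 = torch.zeros(C, F, H, W_OUT)                       # "out_list[0].shape" after the backbone
    vids = torch.stack([out0], dim=0)                        # "After stack: vids.shape" == (1, 16, 3, 30, 52)
    logits = torch.zeros(B, VOCAB, F, H, W_OUT)              # "Final logits.shape"

    # Per-position targets and logits would normally share the same (F, H, W) grid;
    # the logged shapes differ by one column in width:
    print(tuple(tokens.shape[1:]), "vs", tuple(logits.shape[2:]))  # (3, 30, 53) vs (3, 30, 52)

The last (interrupted) forward pass and the resulting traceback follow.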
+[DEBUG-transformer] Input: tokens.shape=torch.Size([1, 3, 30, 53]), encoder_hidden_states.shape=torch.Size([1, 512, 768]), timesteps.shape=torch.Size([1])
+[DEBUG-transformer] After conversion: len(x_list)=1, len(context_list)=1
+[DEBUG-transformer] x_list[0].shape=torch.Size([16, 3, 30, 53])
+[DEBUG-transformer] context_list[0].shape=torch.Size([512, 768])
+[DEBUG-transformer] t_model.shape=torch.Size([1, 1170])
+Traceback (most recent call last):
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1038, in
+ main(parse_args())
+ File "/mnt/Meissonic/train/train_mei_video.py", line 855, in main
+ logits = model(
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__
+ return super().__call__(*args, **kwargs)
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward
+ return model_forward(*args, **kwargs)
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__
+ return convert_to_fp32(self.model_forward(*args, **kwargs))
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
+ return func(*args, **kwargs)
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper
+ return fn(*args, **kwargs)
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ File
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 893, in forward + ``H_out = height // 2`` and ``W_out = width // 2``. + File "/mnt/Meissonic/src/transformer_video.py", line 896, in torch_dynamo_resume_in_forward_at_893 + if DEBUG_TRANSFORMER: + File "/mnt/Meissonic/src/transformer_video.py", line 898, in torch_dynamo_resume_in_forward_at_896 + x_list = self._tokens_to_video(tokens) + File "/mnt/Meissonic/src/transformer_video.py", line 900, in torch_dynamo_resume_in_forward_at_898 + if DEBUG_TRANSFORMER: + File "/mnt/Meissonic/src/transformer_video.py", line 927, in torch_dynamo_resume_in_forward_at_900 + else: + File "/mnt/Meissonic/src/transformer_video.py", line 945, in torch_dynamo_resume_in_forward_at_927 + x_in, t_in, context_in, seq_len_in, y_in = inputs + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 940, in custom_forward + # y: Optional[List[Tensor]] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 670, in forward + context_lens=context_lens) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 423, in forward + assert e[0].dtype == torch.float32 + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 323, in forward + v = self.v(x).view(b, s, n, d) + File "/mnt/Meissonic/src/transformer_video.py", line 319, in qkv_fn + # query, key, value function + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 262, in forward + Args: + File "/mnt/Meissonic/src/transformer_video.py", line 265, in _norm + return self._norm(x.float()).type_as(x) * 
self.weight
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1129, in pack_hook
+ def pack_hook(x):
+KeyboardInterrupt
+[... an identical copy of the above KeyboardInterrupt traceback is printed a second time; repeat omitted ...]
diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/requirements.txt
b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4570fa6b0f0bf7da86531d4495680f7765d3c228 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T09:25:54.228730Z", + "args": [ + 
"--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625038815232" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "lle2brcylg0uou8g0xrp4jqlmk7jgekz" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/wandb-summary.json b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/wandb-summary.json new file mode 100644 index 
0000000000000000000000000000000000000000..87efdcd837b16ff105dece8449bcbba407302821 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":256.243764978,"_step":150,"step_loss":10.83018684387207,"lr":0.0003,"avg_masking_rate":0.9993425607681274,"_timestamp":1.765099785710869e+09,"_wandb":{"runtime":256}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug-core.log b/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..bcd2b7dadde3677bbfc5d29cd65ee15db90951eb --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T09:25:54.294211556Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpogv5_evb/port-3565008.txt","pid":3565008,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T09:25:54.294699193Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3565008} +{"time":"2025-12-07T09:25:54.294677395Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3565008-3565184-3500189253/socket","Net":"unix"}} +{"time":"2025-12-07T09:25:54.480471749Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T09:25:54.486073967Z","level":"INFO","msg":"handleInformInit: received","streamId":"l16v7o9l","id":"1(@)"} +{"time":"2025-12-07T09:25:54.649541234Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"l16v7o9l","id":"1(@)"} +{"time":"2025-12-07T09:30:11.123074114Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T09:30:11.123137613Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T09:30:11.123131063Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T09:30:11.123204327Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T09:30:11.123298142Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3565008-3565184-3500189253/socket","Net":"unix"}} +{"time":"2025-12-07T09:30:11.479778384Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T09:30:11.479807182Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T09:30:11.47982061Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug-internal.log b/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d2b302667ca494f4b476b5ef5c9ba71a66a05aaf --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T09:25:54.486208037Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T09:25:54.649364865Z","level":"INFO","msg":"stream: created new stream","id":"l16v7o9l"} +{"time":"2025-12-07T09:25:54.649450706Z","level":"INFO","msg":"handler: started","stream_id":"l16v7o9l"} +{"time":"2025-12-07T09:25:54.6495338Z","level":"INFO","msg":"stream: started","id":"l16v7o9l"} +{"time":"2025-12-07T09:25:54.649561809Z","level":"INFO","msg":"sender: 
started","stream_id":"l16v7o9l"} +{"time":"2025-12-07T09:25:54.649569006Z","level":"INFO","msg":"writer: started","stream_id":"l16v7o9l"} +{"time":"2025-12-07T09:30:11.123137253Z","level":"INFO","msg":"stream: closing","id":"l16v7o9l"} +{"time":"2025-12-07T09:30:11.373367049Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T09:30:11.476385653Z","level":"INFO","msg":"handler: closed","stream_id":"l16v7o9l"} +{"time":"2025-12-07T09:30:11.476482209Z","level":"INFO","msg":"sender: closed","stream_id":"l16v7o9l"} +{"time":"2025-12-07T09:30:11.476494103Z","level":"INFO","msg":"stream: closed","id":"l16v7o9l"} diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug.log b/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..9311cfffda17f29cf3d30cd46492ad3f0f8a5d89 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug.log @@ -0,0 +1,76 @@ +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_setup.py:_flush():80] Configure stats pid to 3565008 +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug.log +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_092554-l16v7o9l/logs/debug-internal.log +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_init.py:init():841] calling init triggers +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 09:25:54,231 INFO MainThread:3565008 [wandb_init.py:init():889] starting backend +2025-12-07 09:25:54,480 INFO MainThread:3565008 [wandb_init.py:init():892] sending inform_init request +2025-12-07 09:25:54,484 INFO MainThread:3565008 [wandb_init.py:init():900] backend started and connected +2025-12-07 09:25:54,485 INFO MainThread:3565008 [wandb_init.py:init():970] updated telemetry +2025-12-07 09:25:54,489 INFO MainThread:3565008 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 09:25:54,878 INFO MainThread:3565008 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 09:25:54,990 INFO MainThread:3565008 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 09:25:54,990 INFO MainThread:3565008 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 09:25:54,990 INFO MainThread:3565008 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 09:25:54,991 INFO MainThread:3565008 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 09:25:54,994 INFO MainThread:3565008 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 09:25:54,995 INFO MainThread:3565008 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-07 09:30:11,122 INFO wandb-AsyncioManager-main:3565008 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 09:30:11,123 INFO wandb-AsyncioManager-main:3565008 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. +2025-12-07 09:30:11,739 ERROR wandb-AsyncioManager-main:3565008 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback. +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions + await fn() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish + await self._send_server_request(request) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request + await self._writer.drain() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 371, in drain + await self._protocol._drain_helper() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 167, in _drain_helper + raise ConnectionResetError('Connection lost') +ConnectionResetError: Connection lost +2025-12-07 09:30:11,742 ERROR wandb-AsyncioManager-main:3565008 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback. 
+Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions + await fn() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish + await self._send_server_request(request) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request + await self._writer.drain() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 371, in drain + await self._protocol._drain_helper() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 167, in _drain_helper + raise ConnectionResetError('Connection lost') +ConnectionResetError: Connection lost +2025-12-07 09:30:11,743 ERROR wandb-AsyncioManager-main:3565008 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback. +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions + await fn() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish + await self._send_server_request(request) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request + await self._writer.drain() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 371, in drain + await self._protocol._drain_helper() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 167, in _drain_helper + raise ConnectionResetError('Connection lost') +ConnectionResetError: Connection lost +2025-12-07 09:30:11,762 ERROR wandb-AsyncioManager-main:3565008 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback. 
+Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions + await fn() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish + await self._send_server_request(request) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request + await self._writer.drain() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 371, in drain + await self._protocol._drain_helper() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 167, in _drain_helper + raise ConnectionResetError('Connection lost') +ConnectionResetError: Connection lost diff --git a/Meissonic/wandb/run-20251207_092554-l16v7o9l/run-l16v7o9l.wandb b/Meissonic/wandb/run-20251207_092554-l16v7o9l/run-l16v7o9l.wandb new file mode 100644 index 0000000000000000000000000000000000000000..d8f21a0b015113f04eefcad6dce6231365a56ff0 --- /dev/null +++ b/Meissonic/wandb/run-20251207_092554-l16v7o9l/run-l16v7o9l.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a54b264a671a64b57ffde1c8e8418ac703f0d6846c8da4402d026d02f6d1a1b4 +size 357942 diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/config.yaml b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cfeba342e39e95795450190f8bbd7b3840dac432 --- /dev/null +++ b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/config.yaml @@ -0,0 +1,280 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + hrn9v0wjge2pmdr6a82aavw7enahb9ky: + args: + - --text_encoder_architecture + - umt5-base + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625039478784" + email: catherchen77@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 
6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T09:31:18.250585Z" + writerId: hrn9v0wjge2pmdr6a82aavw7enahb9ky + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 
+video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/output.log b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..dd7eca1a3f6af7745c6b3abd1da01a0dc31a9f4f --- /dev/null +++ b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/output.log @@ -0,0 +1,33 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9416.33it/s] +12/07/2025 09:31:26 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 09:31:26 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 09:31:50 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 09:31:58 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 09:31:58 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 09:31:58 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 09:32:03 - INFO - __main__ - ***** Running training ***** +12/07/2025 09:32:03 - INFO - __main__ - Num training steps = 10000 +12/07/2025 09:32:03 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 09:32:03 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/07/2025 09:32:03 - INFO - __main__ - Gradient Accumulation steps = 1 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1038, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 849, in main + vocab_size = accelerator.unwrap_model(model).vocab_size + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 3246, in unwrap_model + return extract_model_from_parallel(model, keep_fp32_wrapper, keep_torch_compile) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/other.py", line 248, in extract_model_from_parallel + model = model.__dict__["_orig_mod"] +KeyError: '_orig_mod' +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1038, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 849, in main +[rank0]: vocab_size = accelerator.unwrap_model(model).vocab_size +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 3246, in unwrap_model +[rank0]: return extract_model_from_parallel(model, keep_fp32_wrapper, keep_torch_compile) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/other.py", line 248, in extract_model_from_parallel +[rank0]: model = model.__dict__["_orig_mod"] +[rank0]: KeyError: '_orig_mod' diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/requirements.txt b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 
+nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8e80d09a907b39a4649483874c31162e6e07cc45 --- /dev/null +++ b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/wandb-metadata.json @@ -0,0 +1,147 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T09:31:18.250585Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "catherchen77@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625039478784" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": 
"GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "hrn9v0wjge2pmdr6a82aavw7enahb9ky" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/wandb-summary.json b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6522728d094978444863b741099b8461af7feae2 --- /dev/null +++ b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":49,"_wandb":{"runtime":49}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug-core.log b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..8bbe1ce761f3519540d69ac2aad3c8924085f89c --- /dev/null +++ b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T09:31:18.3195297Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp8nd70pb7/port-3574822.txt","pid":3574822,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T09:31:18.320022574Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3574822} +{"time":"2025-12-07T09:31:18.32002627Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3574822-3575003-2692958643/socket","Net":"unix"}} +{"time":"2025-12-07T09:31:18.50629049Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T09:31:18.512397205Z","level":"INFO","msg":"handleInformInit: received","streamId":"cbtmmkh2","id":"1(@)"} +{"time":"2025-12-07T09:31:18.682370128Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"cbtmmkh2","id":"1(@)"} +{"time":"2025-12-07T09:32:07.979054574Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T09:32:07.979119149Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T09:32:07.979137263Z","level":"INFO","msg":"server is shutting down"} 
+{"time":"2025-12-07T09:32:07.979168316Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T09:32:07.979244844Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3574822-3575003-2692958643/socket","Net":"unix"}} +{"time":"2025-12-07T09:32:08.370246385Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T09:32:08.370278133Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T09:32:08.370290896Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug-internal.log b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..aeef46059917ec45d72fc74549a4313a80c4802e --- /dev/null +++ b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T09:31:18.512500548Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T09:31:18.682164916Z","level":"INFO","msg":"stream: created new stream","id":"cbtmmkh2"} +{"time":"2025-12-07T09:31:18.682286191Z","level":"INFO","msg":"handler: started","stream_id":"cbtmmkh2"} +{"time":"2025-12-07T09:31:18.682362356Z","level":"INFO","msg":"stream: started","id":"cbtmmkh2"} +{"time":"2025-12-07T09:31:18.682365101Z","level":"INFO","msg":"writer: started","stream_id":"cbtmmkh2"} +{"time":"2025-12-07T09:31:18.682379815Z","level":"INFO","msg":"sender: started","stream_id":"cbtmmkh2"} +{"time":"2025-12-07T09:32:07.979141565Z","level":"INFO","msg":"stream: closing","id":"cbtmmkh2"} +{"time":"2025-12-07T09:32:08.219811941Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T09:32:08.366440065Z","level":"INFO","msg":"handler: closed","stream_id":"cbtmmkh2"} +{"time":"2025-12-07T09:32:08.366540261Z","level":"INFO","msg":"sender: closed","stream_id":"cbtmmkh2"} +{"time":"2025-12-07T09:32:08.366547342Z","level":"INFO","msg":"stream: closed","id":"cbtmmkh2"} diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug.log b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..edc205db76a4be1a069875d55d4db956993109a2 --- /dev/null +++ b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_setup.py:_flush():80] Configure stats pid to 3574822 +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug.log +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_093118-cbtmmkh2/logs/debug-internal.log +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_init.py:init():841] 
calling init triggers +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 09:31:18,253 INFO MainThread:3574822 [wandb_init.py:init():889] starting backend +2025-12-07 09:31:18,506 INFO MainThread:3574822 [wandb_init.py:init():892] sending inform_init request +2025-12-07 09:31:18,510 INFO MainThread:3574822 [wandb_init.py:init():900] backend started and connected +2025-12-07 09:31:18,512 INFO MainThread:3574822 [wandb_init.py:init():970] updated telemetry +2025-12-07 09:31:18,516 INFO MainThread:3574822 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 09:31:18,921 INFO MainThread:3574822 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 09:31:19,034 INFO MainThread:3574822 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 09:31:19,034 INFO MainThread:3574822 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 09:31:19,034 INFO MainThread:3574822 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 09:31:19,034 INFO MainThread:3574822 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 09:31:19,038 INFO MainThread:3574822 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 09:31:19,039 INFO MainThread:3574822 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16'} +2025-12-07 09:32:07,979 INFO wandb-AsyncioManager-main:3574822 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 09:32:07,979 INFO wandb-AsyncioManager-main:3574822 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_093118-cbtmmkh2/run-cbtmmkh2.wandb b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/run-cbtmmkh2.wandb new file mode 100644 index 0000000000000000000000000000000000000000..2bcb251a38d26eba05792cdeef753f66df63b649 Binary files /dev/null and b/Meissonic/wandb/run-20251207_093118-cbtmmkh2/run-cbtmmkh2.wandb differ diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/files/config.yaml b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93e5d40a08be35e150c74acaf0e16cf88e138ce2 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/config.yaml @@ -0,0 +1,284 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + p1ntjzhq7t4p988nwr3bli37yt2tdb2m: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - /mnt/Meissonic/model/diffusion_pytorch_model.safetensors + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625039691776" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA 
A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T09:41:49.989220Z" + writerId: p1ntjzhq7t4p988nwr3bli37yt2tdb2m + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: /mnt/Meissonic/model/diffusion_pytorch_model.safetensors diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/files/output.log b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..046eb53fcf501d34ebf91dbb9e4f418001e7c863 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/output.log @@ -0,0 +1,119 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4541.40it/s] +12/07/2025 09:41:57 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 09:41:57 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 09:42:21 - INFO - __main__ - Loading Wan pretrained weights from: /mnt/Meissonic/model/diffusion_pytorch_model.safetensors +12/07/2025 09:42:39 - WARNING - __main__ - Failed to load Wan pretrained weights: Could not find config.json in /mnt/Meissonic/model/diffusion_pytorch_model.safetensors, continuing with random initialization +12/07/2025 09:42:41 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 09:42:48 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 09:42:48 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 09:42:48 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 09:42:54 - INFO - __main__ - ***** Running training ***** +12/07/2025 09:42:54 - INFO - __main__ - Num training steps = 10000 +12/07/2025 09:42:54 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 09:42:54 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 09:42:54 - INFO - __main__ - Gradient Accumulation steps = 1 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1126, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 979, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook + _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn + fn(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 946, in custom_forward + return 
module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 673, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 441, in forward + x = cross_attn_ffn(x, context, context_lens, e) + File "/mnt/Meissonic/src/transformer_video.py", line 435, in cross_attn_ffn + y = self.ffn( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward + input = module(input) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 816, in forward + return F.gelu(input, approximate=self.approximate) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 20.62 MiB is free. Process 3580645 has 414.00 MiB memory in use. Process 3580643 has 414.00 MiB memory in use. Process 3580646 has 414.00 MiB memory in use. Process 3580644 has 414.00 MiB memory in use. Process 3580647 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 36.59 GiB memory in use. Process 3580649 has 414.00 MiB memory in use. Process 3580648 has 414.00 MiB memory in use. Of the allocated memory 34.89 GiB is allocated by PyTorch, and 609.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1126, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 979, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook +[rank0]: _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn +[rank0]: fn(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 946, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 673, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 441, in forward +[rank0]: x = cross_attn_ffn(x, context, context_lens, e) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 435, in cross_attn_ffn +[rank0]: y = self.ffn( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, 
**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward +[rank0]: input = module(input) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 816, in forward +[rank0]: return F.gelu(input, approximate=self.approximate) +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 20.62 MiB is free. Process 3580645 has 414.00 MiB memory in use. Process 3580643 has 414.00 MiB memory in use. Process 3580646 has 414.00 MiB memory in use. Process 3580644 has 414.00 MiB memory in use. Process 3580647 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 36.59 GiB memory in use. Process 3580649 has 414.00 MiB memory in use. Process 3580648 has 414.00 MiB memory in use. Of the allocated memory 34.89 GiB is allocated by PyTorch, and 609.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/files/requirements.txt b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 
+timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..98d83b7b0e41f5141bbb9187c33fba603a9a2c97 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T09:41:49.989220Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "/mnt/Meissonic/model/diffusion_pytorch_model.safetensors", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 
48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625039691776" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "p1ntjzhq7t4p988nwr3bli37yt2tdb2m" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/files/wandb-summary.json b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..48f7d9a062ccb1f6a06780718d54c131168a079a --- /dev/null +++ b/Meissonic/wandb/run-20251207_094149-lf2olalq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":77},"_runtime":77} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug-core.log b/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..ebb2b693142a930147428f5ac653e2526e1eb4d7 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T09:41:50.064639005Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp_rg_vfge/port-3580642.txt","pid":3580642,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T09:41:50.065147717Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3580642} +{"time":"2025-12-07T09:41:50.065124372Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3580642-3580878-4246946937/socket","Net":"unix"}} +{"time":"2025-12-07T09:41:50.250706233Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T09:41:50.257227926Z","level":"INFO","msg":"handleInformInit: received","streamId":"lf2olalq","id":"1(@)"} +{"time":"2025-12-07T09:41:50.426130053Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"lf2olalq","id":"1(@)"} 
+{"time":"2025-12-07T09:43:08.504831904Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T09:43:08.504894464Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T09:43:08.504886039Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T09:43:08.505020388Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T09:43:08.504991757Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3580642-3580878-4246946937/socket","Net":"unix"}} +{"time":"2025-12-07T09:43:08.875217291Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T09:43:08.875244447Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T09:43:08.875258058Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug-internal.log b/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..59d450e0b1864f99888750ffaac675c101528d94 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T09:41:50.257348078Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T09:41:50.425875816Z","level":"INFO","msg":"stream: created new stream","id":"lf2olalq"} +{"time":"2025-12-07T09:41:50.425996268Z","level":"INFO","msg":"handler: started","stream_id":"lf2olalq"} +{"time":"2025-12-07T09:41:50.426122233Z","level":"INFO","msg":"stream: started","id":"lf2olalq"} +{"time":"2025-12-07T09:41:50.426136835Z","level":"INFO","msg":"sender: started","stream_id":"lf2olalq"} +{"time":"2025-12-07T09:41:50.426140773Z","level":"INFO","msg":"writer: started","stream_id":"lf2olalq"} +{"time":"2025-12-07T09:43:08.504915015Z","level":"INFO","msg":"stream: closing","id":"lf2olalq"} +{"time":"2025-12-07T09:43:08.752887843Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T09:43:08.87209878Z","level":"INFO","msg":"handler: closed","stream_id":"lf2olalq"} +{"time":"2025-12-07T09:43:08.872200303Z","level":"INFO","msg":"sender: closed","stream_id":"lf2olalq"} +{"time":"2025-12-07T09:43:08.872218053Z","level":"INFO","msg":"stream: closed","id":"lf2olalq"} diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug.log b/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..694a4577eb9f3faebb68673d2fbb9c5e29bb80df --- /dev/null +++ b/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_setup.py:_flush():80] Configure stats pid to 3580642 +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_init.py:setup_run_log_directory():714] Logging user logs to 
/mnt/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug.log +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_094149-lf2olalq/logs/debug-internal.log +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_init.py:init():841] calling init triggers +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 09:41:49,992 INFO MainThread:3580642 [wandb_init.py:init():889] starting backend +2025-12-07 09:41:50,250 INFO MainThread:3580642 [wandb_init.py:init():892] sending inform_init request +2025-12-07 09:41:50,255 INFO MainThread:3580642 [wandb_init.py:init():900] backend started and connected +2025-12-07 09:41:50,258 INFO MainThread:3580642 [wandb_init.py:init():970] updated telemetry +2025-12-07 09:41:50,262 INFO MainThread:3580642 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 09:41:50,700 INFO MainThread:3580642 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 09:41:50,809 INFO MainThread:3580642 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 09:41:50,809 INFO MainThread:3580642 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 09:41:50,809 INFO MainThread:3580642 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 09:41:50,809 INFO MainThread:3580642 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 09:41:50,812 INFO MainThread:3580642 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 09:41:50,812 INFO MainThread:3580642 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': '/mnt/Meissonic/model/diffusion_pytorch_model.safetensors'} +2025-12-07 09:43:08,504 INFO 
wandb-AsyncioManager-main:3580642 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 09:43:08,505 INFO wandb-AsyncioManager-main:3580642 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251207_094149-lf2olalq/run-lf2olalq.wandb b/Meissonic/wandb/run-20251207_094149-lf2olalq/run-lf2olalq.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f0c349ba431935a83e340feca7579e8a8af6a67f Binary files /dev/null and b/Meissonic/wandb/run-20251207_094149-lf2olalq/run-lf2olalq.wandb differ diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/config.yaml b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a192927d1a8ea8b2b8f342a90f8c15fd352f2ee0 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/config.yaml @@ -0,0 +1,284 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + f2gqpygidlfsv61v2zwawx8i761aqkd8: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11625039831040" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + 
memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T09:43:29.106543Z" + writerId: f2gqpygidlfsv61v2zwawx8i761aqkd8 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/output.log b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..11fab84142271d27c14604fd853cc64e7505dded --- /dev/null +++ b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/output.log @@ -0,0 +1,943 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7943.76it/s] +12/07/2025 09:43:36 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 09:43:36 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 09:44:01 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 09:44:04 - WARNING - __main__ - Failed to load Wan pretrained weights: Error(s) in loading state_dict for WanModel: + size mismatch for patch_embedding.weight: copying a param with shape torch.Size([1536, 16, 1, 2, 2]) from checkpoint, the shape in current model is torch.Size([2048, 16, 1, 2, 2]). + size mismatch for patch_embedding.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([2048, 768]). + size mismatch for text_embedding.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for text_embedding.2.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for text_embedding.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for time_embedding.0.weight: copying a param with shape torch.Size([1536, 256]) from checkpoint, the shape in current model is torch.Size([2048, 256]). + size mismatch for time_embedding.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for time_embedding.2.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for time_embedding.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for time_projection.1.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([12288, 2048]). + size mismatch for time_projection.1.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([12288]). + size mismatch for blocks.0.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.0.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.0.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.0.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). 
+ size mismatch for blocks.0.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]).
+ size mismatch for blocks.0.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]).
+ size mismatch for blocks.0.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]).
+ size mismatch for blocks.0.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]).
+ [... the same size mismatch is reported for every remaining parameter of every transformer block (modulation, self_attn q/k/v/o, self_attn norm_q/norm_k, norm3, cross_attn q/k/v/o, cross_attn norm_q/norm_k, ffn.0, ffn.2), here through blocks.16: the checkpoint stores hidden dim 1536 and FFN dim 8960, while the instantiated model expects hidden dim 2048 and FFN dim 8192 ...]
+ size mismatch for blocks.16.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.16.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.16.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.16.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.16.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.16.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.16.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.16.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.16.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.16.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.16.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.16.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.16.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.16.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.17.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.17.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.17.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.17.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.17.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.17.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.17.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.17.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.17.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.17.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.17.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.17.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.17.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.17.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). 
+ size mismatch for blocks.18.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.18.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.18.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.18.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.18.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.18.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.18.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.18.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.18.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.18.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.18.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.18.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.18.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.18.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.19.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.19.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.19.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.19.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.19.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.19.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). 
+ size mismatch for blocks.19.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.19.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.19.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.19.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.19.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.19.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.19.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.20.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.20.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.20.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.20.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.20.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.20.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.20.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.20.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.20.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.20.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.20.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.20.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.20.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.20.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.21.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.21.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). 
+ size mismatch for blocks.21.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.21.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.21.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.21.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.21.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.21.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.21.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.21.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.21.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). 
+ size mismatch for blocks.21.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.21.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.22.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.22.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.22.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.22.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.22.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.22.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.22.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.22.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.22.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.22.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.22.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.22.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.22.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.22.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.23.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.23.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.23.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.23.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.23.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.23.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.23.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.23.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.23.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.23.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.23.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.23.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.23.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.23.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.24.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.24.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.24.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.24.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.24.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.24.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.24.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.24.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.24.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.24.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.24.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.24.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.24.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.24.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). 
+ size mismatch for blocks.25.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.25.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.25.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.25.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.25.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.25.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.25.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.25.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.25.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.25.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.25.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.25.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.25.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.25.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.26.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.26.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.26.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.26.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.26.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.26.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). 
+ size mismatch for blocks.26.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.26.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.26.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.26.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.26.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.26.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.26.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.27.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.27.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.27.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.27.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.27.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.27.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.27.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.27.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.27.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.27.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.27.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.27.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.27.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.27.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.28.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.28.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). 
+ size mismatch for blocks.28.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.28.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.28.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.28.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.28.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.28.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.28.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.28.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.28.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). 
+ size mismatch for blocks.28.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.28.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.modulation: copying a param with shape torch.Size([1, 6, 1536]) from checkpoint, the shape in current model is torch.Size([1, 6, 2048]). + size mismatch for blocks.29.self_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.29.self_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.self_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.29.self_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.self_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.29.self_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.self_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.29.self_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.self_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.self_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.norm3.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.norm3.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.cross_attn.q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.29.cross_attn.q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.cross_attn.k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.29.cross_attn.k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.cross_attn.v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.29.cross_attn.v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). 
+ size mismatch for blocks.29.cross_attn.o.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([2048, 2048]). + size mismatch for blocks.29.cross_attn.o.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.cross_attn.norm_q.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.cross_attn.norm_k.weight: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for blocks.29.ffn.0.weight: copying a param with shape torch.Size([8960, 1536]) from checkpoint, the shape in current model is torch.Size([8192, 2048]). + size mismatch for blocks.29.ffn.0.bias: copying a param with shape torch.Size([8960]) from checkpoint, the shape in current model is torch.Size([8192]). + size mismatch for blocks.29.ffn.2.weight: copying a param with shape torch.Size([1536, 8960]) from checkpoint, the shape in current model is torch.Size([2048, 8192]). + size mismatch for blocks.29.ffn.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([2048]). + size mismatch for head.modulation: copying a param with shape torch.Size([1, 2, 1536]) from checkpoint, the shape in current model is torch.Size([1, 2, 2048]). + size mismatch for head.head.weight: copying a param with shape torch.Size([64, 1536]) from checkpoint, the shape in current model is torch.Size([64, 2048])., continuing with random initialization +12/07/2025 09:44:05 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 09:44:12 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 09:44:12 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 09:44:12 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 09:44:18 - INFO - __main__ - ***** Running training ***** +12/07/2025 09:44:18 - INFO - __main__ - Num training steps = 10000 +12/07/2025 09:44:18 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 09:44:18 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/07/2025 09:44:18 - INFO - __main__ - Gradient Accumulation steps = 1 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1126, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 979, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook + _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn + fn(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 946, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 673, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 441, in forward + x = cross_attn_ffn(x, context, context_lens, e) + File "/mnt/Meissonic/src/transformer_video.py", line 435, in cross_attn_ffn + y = self.ffn( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward + input = 
module(input) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 816, in forward + return F.gelu(input, approximate=self.approximate) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 20.62 MiB is free. Process 3585417 has 414.00 MiB memory in use. Process 3585418 has 414.00 MiB memory in use. Process 3585416 has 414.00 MiB memory in use. Process 3585414 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 36.59 GiB memory in use. Process 3585413 has 414.00 MiB memory in use. Process 3585412 has 414.00 MiB memory in use. Process 3585415 has 414.00 MiB memory in use. Of the allocated memory 34.89 GiB is allocated by PyTorch, and 609.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1126, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 979, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook +[rank0]: _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn +[rank0]: fn(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 946, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, 
y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 673, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 441, in forward +[rank0]: x = cross_attn_ffn(x, context, context_lens, e) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 435, in cross_attn_ffn +[rank0]: y = self.ffn( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward +[rank0]: input = module(input) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 816, in forward +[rank0]: return F.gelu(input, approximate=self.approximate) +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 20.62 MiB is free. Process 3585417 has 414.00 MiB memory in use. Process 3585418 has 414.00 MiB memory in use. Process 3585416 has 414.00 MiB memory in use. Process 3585414 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 36.59 GiB memory in use. Process 3585413 has 414.00 MiB memory in use. Process 3585412 has 414.00 MiB memory in use. Process 3585415 has 414.00 MiB memory in use. Of the allocated memory 34.89 GiB is allocated by PyTorch, and 609.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/requirements.txt b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..14489e713cbd36b0f8d7a488c8b5d7d208064d21 --- /dev/null +++ 
b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T09:43:29.106543Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11625039831040" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": 
"f2gqpygidlfsv61v2zwawx8i761aqkd8" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/wandb-summary.json b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..7e8f5e934a7f744fd58fa7af47ff8a9e71aad891 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":59},"_runtime":59} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug-core.log b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..b3d815d868c64fe9f5c325de9996b91958e478a0 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T09:43:29.177181849Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpxqn1wppq/port-3585411.txt","pid":3585411,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T09:43:29.177759501Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3585411} +{"time":"2025-12-07T09:43:29.177773557Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3585411-3585693-3601256257/socket","Net":"unix"}} +{"time":"2025-12-07T09:43:29.364054969Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T09:43:29.36990525Z","level":"INFO","msg":"handleInformInit: received","streamId":"qf4q6gjw","id":"1(@)"} +{"time":"2025-12-07T09:43:29.541209706Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"qf4q6gjw","id":"1(@)"} +{"time":"2025-12-07T09:44:29.703387929Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T09:44:29.703449896Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T09:44:29.703439515Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T09:44:29.703509913Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T09:44:29.703552693Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3585411-3585693-3601256257/socket","Net":"unix"}} +{"time":"2025-12-07T09:44:30.156490439Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T09:44:30.156518522Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T09:44:30.156529283Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug-internal.log b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..7f5d487813f1833847138b50187680d0c782936e --- /dev/null +++ b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T09:43:29.37001376Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T09:43:29.541024833Z","level":"INFO","msg":"stream: created new stream","id":"qf4q6gjw"} +{"time":"2025-12-07T09:43:29.54110874Z","level":"INFO","msg":"handler: started","stream_id":"qf4q6gjw"} +{"time":"2025-12-07T09:43:29.541200413Z","level":"INFO","msg":"stream: 
started","id":"qf4q6gjw"} +{"time":"2025-12-07T09:43:29.54121393Z","level":"INFO","msg":"writer: started","stream_id":"qf4q6gjw"} +{"time":"2025-12-07T09:43:29.541213688Z","level":"INFO","msg":"sender: started","stream_id":"qf4q6gjw"} +{"time":"2025-12-07T09:44:29.703450129Z","level":"INFO","msg":"stream: closing","id":"qf4q6gjw"} +{"time":"2025-12-07T09:44:29.975689445Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T09:44:30.153139577Z","level":"INFO","msg":"handler: closed","stream_id":"qf4q6gjw"} +{"time":"2025-12-07T09:44:30.153249605Z","level":"INFO","msg":"sender: closed","stream_id":"qf4q6gjw"} +{"time":"2025-12-07T09:44:30.153261851Z","level":"INFO","msg":"stream: closed","id":"qf4q6gjw"} diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug.log b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..bcb40b416ae3d90c149f6d29bc425273e6c01687 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_setup.py:_flush():80] Configure stats pid to 3585411 +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug.log +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_094329-qf4q6gjw/logs/debug-internal.log +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_init.py:init():841] calling init triggers +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 09:43:29,111 INFO MainThread:3585411 [wandb_init.py:init():889] starting backend +2025-12-07 09:43:29,363 INFO MainThread:3585411 [wandb_init.py:init():892] sending inform_init request +2025-12-07 09:43:29,368 INFO MainThread:3585411 [wandb_init.py:init():900] backend started and connected +2025-12-07 09:43:29,369 INFO MainThread:3585411 [wandb_init.py:init():970] updated telemetry +2025-12-07 09:43:29,374 INFO MainThread:3585411 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 09:43:29,711 INFO MainThread:3585411 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 09:43:29,864 INFO MainThread:3585411 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 09:43:29,864 INFO MainThread:3585411 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 09:43:29,864 INFO MainThread:3585411 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 09:43:29,864 INFO MainThread:3585411 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 09:43:29,867 INFO MainThread:3585411 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 09:43:29,868 INFO MainThread:3585411 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 09:44:29,703 INFO wandb-AsyncioManager-main:3585411 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 09:44:29,703 INFO wandb-AsyncioManager-main:3585411 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_094329-qf4q6gjw/run-qf4q6gjw.wandb b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/run-qf4q6gjw.wandb new file mode 100644 index 0000000000000000000000000000000000000000..75162ae8cb703879a2073a49389f54de0db88e00 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094329-qf4q6gjw/run-qf4q6gjw.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f5e1dbf6a99f47d687d5989380d6df233812b32fb203df051052f59812538eb +size 167372 diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/config.yaml b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9b6d77444b88a2b79290d4ca659c5e41ab5d56f --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/config.yaml @@ -0,0 +1,286 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + ppy4m183yaci9erpm1qqdstkz01uu42w: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "50" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11630716583936" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T09:47:15.462893Z" + writerId: ppy4m183yaci9erpm1qqdstkz01uu42w + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/output.log b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1c499ad43a37f6b805b757d5f1f5edbc5e0ea246 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/output.log @@ -0,0 +1,53 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11500.25it/s] +12/07/2025 09:47:23 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 09:47:23 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 09:47:23 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 09:47:23 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 09:47:39 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 09:47:41 - WARNING - __main__ - Failed to load Wan pretrained weights: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768])., continuing with random initialization +12/07/2025 09:47:41 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 09:47:48 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 09:47:48 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 09:47:48 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 09:47:53 - INFO - __main__ - ***** Running training ***** +12/07/2025 09:47:53 - INFO - __main__ - Num training steps = 10000 +12/07/2025 09:47:53 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 09:47:53 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/07/2025 09:47:53 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 09:50:23 - INFO - __main__ - Step: 50 Loss: 11.0876 LR: 0.000300 +12/07/2025 09:52:47 - INFO - __main__ - Step: 100 Loss: 11.0012 LR: 0.000300 +12/07/2025 09:55:11 - INFO - __main__ - Step: 150 Loss: 10.7584 LR: 0.000300 +12/07/2025 09:57:36 - INFO - __main__ - Step: 200 Loss: 10.5416 LR: 0.000300 +12/07/2025 09:59:59 - INFO - __main__ - Step: 250 Loss: 10.2762 LR: 0.000300 +12/07/2025 10:02:24 - INFO - __main__ - Step: 300 Loss: 10.3201 LR: 0.000300 +12/07/2025 10:04:48 - INFO - __main__ - Step: 350 Loss: 10.2970 LR: 0.000300 +12/07/2025 10:07:13 - INFO - __main__ - Step: 400 Loss: 10.3445 LR: 0.000300 +12/07/2025 10:09:36 - INFO - __main__ - Step: 450 Loss: 10.2980 LR: 0.000300 +12/07/2025 10:11:59 - INFO - __main__ - Step: 500 Loss: 10.1985 LR: 0.000300 +12/07/2025 10:11:59 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-500 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1200, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1082, in main + save_checkpoint(args, accelerator, global_step + 1, logger) + File "/mnt/Meissonic/train/trainer_utils.py", line 44, in save_checkpoint + accelerator.save_state(save_path) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 3695, in save_state + hook(self._models, weights, output_dir) + File "/mnt/Meissonic/train/train_mei_video.py", line 683, in save_model_hook + raise ValueError(f"unexpected save model: {model_.__class__}") +ValueError: unexpected save model: +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1200, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1082, in main +[rank0]: save_checkpoint(args, accelerator, global_step + 1, logger) +[rank0]: File "/mnt/Meissonic/train/trainer_utils.py", line 44, in save_checkpoint +[rank0]: accelerator.save_state(save_path) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 3695, in save_state +[rank0]: hook(self._models, weights, output_dir) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 683, in save_model_hook +[rank0]: raise ValueError(f"unexpected save model: {model_.__class__}") +[rank0]: ValueError: unexpected save model: diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/requirements.txt b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 
+peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f74dc15af450bd1ba73a6d1da5e2a30b6276d395 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T09:47:15.462893Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "50", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + 
"--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11630716583936" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "ppy4m183yaci9erpm1qqdstkz01uu42w" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/wandb-summary.json b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..25eeae506c8c3cfa915928d211c15f043d3db77c --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/files/wandb-summary.json @@ -0,0 +1 @@ +{"lr":0.0003,"avg_masking_rate":0.957585334777832,"_timestamp":1.7651023195548415e+09,"_wandb":{"runtime":1483},"_runtime":1483.490575788,"_step":500,"step_loss":10.198528289794922} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug-core.log b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..e70fc50bbf9888c73a9cb308813a47519c8c4bff --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T09:47:15.530497283Z","level":"INFO","msg":"main: starting 
server","port-filename":"/opt/dlami/nvme/tmp_user/tmp5otziggh/port-3590186.txt","pid":3590186,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T09:47:15.531073735Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3590186} +{"time":"2025-12-07T09:47:15.531048276Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3590186-3590414-2919337043/socket","Net":"unix"}} +{"time":"2025-12-07T09:47:15.718001833Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T09:47:15.724970531Z","level":"INFO","msg":"handleInformInit: received","streamId":"uvgb9hvt","id":"1(@)"} +{"time":"2025-12-07T09:47:15.895057223Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"uvgb9hvt","id":"1(@)"} +{"time":"2025-12-07T10:11:59.586605339Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T10:11:59.586671189Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T10:11:59.586665758Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T10:11:59.586751908Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T10:11:59.586790876Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3590186-3590414-2919337043/socket","Net":"unix"}} +{"time":"2025-12-07T10:11:59.953899556Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T10:11:59.95396061Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T10:11:59.95398758Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug-internal.log b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8aa9cf7a9513bb5103112c2916aad4e770ec9e0a --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T09:47:15.725110585Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T09:47:15.894785049Z","level":"INFO","msg":"stream: created new stream","id":"uvgb9hvt"} +{"time":"2025-12-07T09:47:15.894954103Z","level":"INFO","msg":"handler: started","stream_id":"uvgb9hvt"} +{"time":"2025-12-07T09:47:15.895049514Z","level":"INFO","msg":"stream: started","id":"uvgb9hvt"} +{"time":"2025-12-07T09:47:15.895062946Z","level":"INFO","msg":"writer: started","stream_id":"uvgb9hvt"} +{"time":"2025-12-07T09:47:15.895066576Z","level":"INFO","msg":"sender: started","stream_id":"uvgb9hvt"} +{"time":"2025-12-07T10:11:59.586672175Z","level":"INFO","msg":"stream: closing","id":"uvgb9hvt"} +{"time":"2025-12-07T10:11:59.840804359Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T10:11:59.949791101Z","level":"INFO","msg":"handler: closed","stream_id":"uvgb9hvt"} +{"time":"2025-12-07T10:11:59.949985685Z","level":"INFO","msg":"sender: closed","stream_id":"uvgb9hvt"} +{"time":"2025-12-07T10:11:59.950006182Z","level":"INFO","msg":"stream: closed","id":"uvgb9hvt"} diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug.log b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug.log new file mode 100644 index 
0000000000000000000000000000000000000000..c518fefe31404d9d8e5c1b653617512527adf1c7 --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_setup.py:_flush():80] Configure stats pid to 3590186 +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug.log +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_094715-uvgb9hvt/logs/debug-internal.log +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_init.py:init():841] calling init triggers +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 09:47:15,465 INFO MainThread:3590186 [wandb_init.py:init():889] starting backend +2025-12-07 09:47:15,717 INFO MainThread:3590186 [wandb_init.py:init():892] sending inform_init request +2025-12-07 09:47:15,723 INFO MainThread:3590186 [wandb_init.py:init():900] backend started and connected +2025-12-07 09:47:15,725 INFO MainThread:3590186 [wandb_init.py:init():970] updated telemetry +2025-12-07 09:47:15,731 INFO MainThread:3590186 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 09:47:16,095 INFO MainThread:3590186 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 09:47:16,273 INFO MainThread:3590186 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 09:47:16,273 INFO MainThread:3590186 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 09:47:16,273 INFO MainThread:3590186 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 09:47:16,273 INFO MainThread:3590186 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 09:47:16,277 INFO MainThread:3590186 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 09:47:16,278 INFO MainThread:3590186 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 10:11:59,586 INFO wandb-AsyncioManager-main:3590186 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 10:11:59,586 INFO wandb-AsyncioManager-main:3590186 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_094715-uvgb9hvt/run-uvgb9hvt.wandb b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/run-uvgb9hvt.wandb new file mode 100644 index 0000000000000000000000000000000000000000..b52efd9afd22bf5a05e0f00ef969562c3b98363a --- /dev/null +++ b/Meissonic/wandb/run-20251207_094715-uvgb9hvt/run-uvgb9hvt.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10df54039e3f89568c055b9940920535f0dca940cb0b9e2418f8651fb77f9935 +size 361950 diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/config.yaml b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e4cb906a449e881fc4ee912361a7b6c601596ef4 --- /dev/null +++ b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/config.yaml @@ -0,0 +1,286 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + t2zchsd6jt0bqckdewza5t1tcccjtdhd: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "100" + - --validation_steps + - "100" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11630717394944" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T10:16:12.898674Z" + writerId: t2zchsd6jt0bqckdewza5t1tcccjtdhd + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 100 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 100 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/output.log b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f9f5304d71a3dc36e0142d3946d43ad7db28fa8e --- /dev/null +++ b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/output.log @@ -0,0 +1,53 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 12167.48it/s] +12/07/2025 10:16:20 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 10:16:20 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 10:16:20 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:16:21 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 10:16:37 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:16:39 - WARNING - __main__ - Failed to load Wan pretrained weights: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768])., continuing with random initialization +12/07/2025 10:16:39 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 10:16:46 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 10:16:46 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 10:16:46 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 10:16:50 - INFO - __main__ - ***** Running training ***** +12/07/2025 10:16:50 - INFO - __main__ - Num training steps = 10000 +12/07/2025 10:16:50 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 10:16:50 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/07/2025 10:16:50 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 10:17:23 - INFO - __main__ - Step: 10 Loss: 11.1008 LR: 0.000300 +12/07/2025 10:17:52 - INFO - __main__ - Step: 20 Loss: 11.2805 LR: 0.000300 +12/07/2025 10:18:20 - INFO - __main__ - Step: 30 Loss: 11.1139 LR: 0.000300 +12/07/2025 10:18:48 - INFO - __main__ - Step: 40 Loss: 11.0921 LR: 0.000300 +12/07/2025 10:19:15 - INFO - __main__ - Step: 50 Loss: 11.0870 LR: 0.000300 +12/07/2025 10:19:42 - INFO - __main__ - Step: 60 Loss: 11.0776 LR: 0.000300 +12/07/2025 10:20:10 - INFO - __main__ - Step: 70 Loss: 11.0631 LR: 0.000300 +12/07/2025 10:20:38 - INFO - __main__ - Step: 80 Loss: 11.0385 LR: 0.000300 +12/07/2025 10:21:06 - INFO - __main__ - Step: 90 Loss: 11.0306 LR: 0.000300 +12/07/2025 10:21:33 - INFO - __main__ - Step: 100 Loss: 11.0011 LR: 0.000300 +12/07/2025 10:21:33 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-100 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1207, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1089, in main + save_checkpoint(args, accelerator, global_step + 1, logger) + File "/mnt/Meissonic/train/trainer_utils.py", line 44, in save_checkpoint + accelerator.save_state(save_path) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 3695, in save_state + hook(self._models, weights, output_dir) + File "/mnt/Meissonic/train/train_mei_video.py", line 690, in save_model_hook + raise ValueError(f"unexpected save model: {model_.__class__}, unwrapped: {unwrapped_model_.__class__}") +ValueError: unexpected save model: , unwrapped: +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1207, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1089, in main +[rank0]: save_checkpoint(args, accelerator, global_step + 1, logger) +[rank0]: File "/mnt/Meissonic/train/trainer_utils.py", line 44, in save_checkpoint +[rank0]: accelerator.save_state(save_path) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 3695, in save_state +[rank0]: hook(self._models, weights, output_dir) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 690, in save_model_hook +[rank0]: raise ValueError(f"unexpected save model: {model_.__class__}, unwrapped: {unwrapped_model_.__class__}") +[rank0]: ValueError: unexpected save model: , unwrapped: diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/requirements.txt b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 
+itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5097e324632a8aa23cea9225c97255a46994de2a --- /dev/null +++ b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T10:16:12.898674Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "100", + "--validation_steps", + "100", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + 
"--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11630717394944" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "t2zchsd6jt0bqckdewza5t1tcccjtdhd" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/wandb-summary.json b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..382eb362d94357f30789468b8adf450991fa7b81 --- /dev/null +++ b/Meissonic/wandb/run-20251207_101612-n95rfdqt/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":320},"_runtime":320.221997214,"avg_masking_rate":0.335297554731369,"_timestamp":1.7651028939071417e+09,"_step":100,"step_loss":11.001090049743652,"lr":0.0003} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug-core.log b/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..b03b1b579588f24672b5eb070404cc03cd7bdaf6 --- /dev/null +++ b/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T10:16:12.964350134Z","level":"INFO","msg":"main: starting 
server","port-filename":"/opt/dlami/nvme/tmp_user/tmpldsset8g/port-3730296.txt","pid":3730296,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T10:16:12.964806772Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3730296} +{"time":"2025-12-07T10:16:12.964802288Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3730296-3730526-1945501427/socket","Net":"unix"}} +{"time":"2025-12-07T10:16:13.151681844Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T10:16:13.157955233Z","level":"INFO","msg":"handleInformInit: received","streamId":"n95rfdqt","id":"1(@)"} +{"time":"2025-12-07T10:16:13.323944512Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"n95rfdqt","id":"1(@)"} +{"time":"2025-12-07T10:21:33.937958137Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T10:21:33.938031919Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T10:21:33.93802301Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T10:21:33.938140821Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T10:21:33.938135837Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3730296-3730526-1945501427/socket","Net":"unix"}} +{"time":"2025-12-07T10:21:34.293488786Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T10:21:34.293511618Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T10:21:34.29351879Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug-internal.log b/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8975d15dcd7add24b6377e325e0b7830d9336af8 --- /dev/null +++ b/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T10:16:13.158065481Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T10:16:13.323692954Z","level":"INFO","msg":"stream: created new stream","id":"n95rfdqt"} +{"time":"2025-12-07T10:16:13.323798069Z","level":"INFO","msg":"handler: started","stream_id":"n95rfdqt"} +{"time":"2025-12-07T10:16:13.32393586Z","level":"INFO","msg":"stream: started","id":"n95rfdqt"} +{"time":"2025-12-07T10:16:13.323952667Z","level":"INFO","msg":"sender: started","stream_id":"n95rfdqt"} +{"time":"2025-12-07T10:16:13.323955837Z","level":"INFO","msg":"writer: started","stream_id":"n95rfdqt"} +{"time":"2025-12-07T10:21:33.938037249Z","level":"INFO","msg":"stream: closing","id":"n95rfdqt"} +{"time":"2025-12-07T10:21:34.19788378Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T10:21:34.290077199Z","level":"INFO","msg":"handler: closed","stream_id":"n95rfdqt"} +{"time":"2025-12-07T10:21:34.290177383Z","level":"INFO","msg":"sender: closed","stream_id":"n95rfdqt"} +{"time":"2025-12-07T10:21:34.290194642Z","level":"INFO","msg":"stream: closed","id":"n95rfdqt"} diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug.log b/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug.log new file mode 100644 index 
0000000000000000000000000000000000000000..627dcf2dab412808b5c0abb47c0b0f8330bd130d --- /dev/null +++ b/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_setup.py:_flush():80] Configure stats pid to 3730296 +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug.log +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_101612-n95rfdqt/logs/debug-internal.log +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_init.py:init():841] calling init triggers +2025-12-07 10:16:12,901 INFO MainThread:3730296 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 10:16:12,902 INFO MainThread:3730296 [wandb_init.py:init():889] starting backend +2025-12-07 10:16:13,151 INFO MainThread:3730296 [wandb_init.py:init():892] sending inform_init request +2025-12-07 10:16:13,156 INFO MainThread:3730296 [wandb_init.py:init():900] backend started and connected +2025-12-07 10:16:13,157 INFO MainThread:3730296 [wandb_init.py:init():970] updated telemetry +2025-12-07 10:16:13,162 INFO MainThread:3730296 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 10:16:13,715 INFO MainThread:3730296 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 10:16:13,828 INFO MainThread:3730296 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 10:16:13,829 INFO MainThread:3730296 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 10:16:13,829 INFO MainThread:3730296 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 10:16:13,829 INFO MainThread:3730296 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 10:16:13,832 INFO MainThread:3730296 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 10:16:13,833 INFO MainThread:3730296 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 100, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 10:21:33,938 INFO wandb-AsyncioManager-main:3730296 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 10:21:33,938 INFO wandb-AsyncioManager-main:3730296 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_101612-n95rfdqt/run-n95rfdqt.wandb b/Meissonic/wandb/run-20251207_101612-n95rfdqt/run-n95rfdqt.wandb new file mode 100644 index 0000000000000000000000000000000000000000..0536768fc9c5891f1ac5957eb3435df9b80646cf Binary files /dev/null and b/Meissonic/wandb/run-20251207_101612-n95rfdqt/run-n95rfdqt.wandb differ diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/config.yaml b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83dc3dac0d9ce80167433faecfbfbeb8913e8db6 --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/config.yaml @@ -0,0 +1,286 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + pufgsiuucg7dwajur84am7d33j14bmif: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "10" + - --validation_steps + - "100" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11630717865984" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T10:24:54.751174Z" + writerId: pufgsiuucg7dwajur84am7d33j14bmif + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 10 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 100 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/output.log b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..509f068e2f02b76f447d0df38bc4aa8604d1fb49 --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/output.log @@ -0,0 +1,110 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3693.10it/s] +12/07/2025 10:25:02 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 10:25:02 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 10:25:02 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:25:02 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 10:25:18 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:25:20 - WARNING - __main__ - Failed to load Wan pretrained weights: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768])., continuing with random initialization +12/07/2025 10:25:20 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 10:25:27 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 10:25:27 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 10:25:27 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 10:25:32 - INFO - __main__ - ***** Running training ***** +12/07/2025 10:25:32 - INFO - __main__ - Num training steps = 10000 +12/07/2025 10:25:32 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 10:25:32 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/07/2025 10:25:32 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 10:26:05 - INFO - __main__ - Step: 10 Loss: 11.1009 LR: 0.000300 +12/07/2025 10:26:05 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-10 +12/07/2025 10:26:17 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-10/optimizer.bin +12/07/2025 10:26:17 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-10/scheduler.bin +12/07/2025 10:26:17 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-10/sampler.bin +12/07/2025 10:26:17 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-10/random_states_0.pkl +12/07/2025 10:26:17 - INFO - __main__ - Saved state to output/checkpoint-10 +12/07/2025 10:26:44 - INFO - __main__ - Step: 20 Loss: 11.3026 LR: 0.000300 +12/07/2025 10:26:44 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-20 +12/07/2025 10:26:56 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-20/optimizer.bin +12/07/2025 10:26:56 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-20/scheduler.bin +12/07/2025 10:26:56 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-20/sampler.bin +12/07/2025 10:26:56 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-20/random_states_0.pkl +12/07/2025 10:26:56 - INFO - __main__ - Saved state to output/checkpoint-20 +12/07/2025 10:27:23 - INFO - __main__ - Step: 30 Loss: 11.1147 LR: 0.000300 +12/07/2025 10:27:23 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-30 +12/07/2025 10:27:35 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-30/optimizer.bin +12/07/2025 10:27:35 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-30/scheduler.bin +12/07/2025 10:27:35 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-30/sampler.bin +12/07/2025 10:27:35 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-30/random_states_0.pkl +12/07/2025 10:27:35 - INFO - __main__ - Saved state to output/checkpoint-30 +12/07/2025 10:28:01 - INFO - __main__ - Step: 40 Loss: 11.0919 LR: 0.000300 +12/07/2025 10:28:01 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-40 +12/07/2025 10:28:12 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-40/optimizer.bin +12/07/2025 10:28:12 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-40/scheduler.bin +12/07/2025 10:28:12 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-40/sampler.bin +12/07/2025 10:28:12 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-40/random_states_0.pkl +12/07/2025 10:28:12 - INFO - __main__ - Saved state to output/checkpoint-40 +12/07/2025 10:28:37 - INFO - __main__ - Step: 50 Loss: 11.0874 LR: 0.000300 +12/07/2025 10:28:37 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-50 +12/07/2025 10:28:49 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-50/optimizer.bin +12/07/2025 10:28:49 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-50/scheduler.bin +12/07/2025 10:28:49 - INFO - accelerate.checkpointing - Sampler state 
for dataloader 0 saved in output/checkpoint-50/sampler.bin +12/07/2025 10:28:49 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-50/random_states_0.pkl +12/07/2025 10:28:49 - INFO - __main__ - Saved state to output/checkpoint-50 +12/07/2025 10:29:15 - INFO - __main__ - Step: 60 Loss: 11.0777 LR: 0.000300 +12/07/2025 10:29:15 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-60 +12/07/2025 10:29:26 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-60/optimizer.bin +12/07/2025 10:29:26 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-60/scheduler.bin +12/07/2025 10:29:26 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-60/sampler.bin +12/07/2025 10:29:26 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-60/random_states_0.pkl +12/07/2025 10:29:26 - INFO - __main__ - Saved state to output/checkpoint-60 +12/07/2025 10:29:52 - INFO - __main__ - Step: 70 Loss: 11.0639 LR: 0.000300 +12/07/2025 10:29:52 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-70 +12/07/2025 10:30:04 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-70/optimizer.bin +12/07/2025 10:30:04 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-70/scheduler.bin +12/07/2025 10:30:04 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-70/sampler.bin +12/07/2025 10:30:04 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-70/random_states_0.pkl +12/07/2025 10:30:04 - INFO - __main__ - Saved state to output/checkpoint-70 +12/07/2025 10:30:30 - INFO - __main__ - Step: 80 Loss: 11.0408 LR: 0.000300 +12/07/2025 10:30:30 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-80 +12/07/2025 10:30:42 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-80/optimizer.bin +12/07/2025 10:30:42 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-80/scheduler.bin +12/07/2025 10:30:42 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-80/sampler.bin +12/07/2025 10:30:42 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-80/random_states_0.pkl +12/07/2025 10:30:42 - INFO - __main__ - Saved state to output/checkpoint-80 +12/07/2025 10:31:08 - INFO - __main__ - Step: 90 Loss: 11.0328 LR: 0.000300 +12/07/2025 10:31:08 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-90 +12/07/2025 10:31:19 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-90/optimizer.bin +12/07/2025 10:31:19 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-90/scheduler.bin +12/07/2025 10:31:19 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-90/sampler.bin +12/07/2025 10:31:19 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-90/random_states_0.pkl +12/07/2025 10:31:19 - INFO - __main__ - Saved state to output/checkpoint-90 +12/07/2025 10:31:46 - INFO - __main__ - Step: 100 Loss: 11.0054 LR: 0.000300 +12/07/2025 10:31:46 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-100 +12/07/2025 10:31:58 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-100/optimizer.bin +12/07/2025 10:31:58 - INFO - 
accelerate.checkpointing - Scheduler state saved in output/checkpoint-100/scheduler.bin +12/07/2025 10:31:58 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-100/sampler.bin +12/07/2025 10:31:58 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-100/random_states_0.pkl +12/07/2025 10:31:58 - INFO - __main__ - Saved state to output/checkpoint-100 +12/07/2025 10:31:58 - INFO - __main__ - Generating videos for validation... +12/07/2025 10:31:58 - INFO - __main__ - Generating videos for validation... +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1213, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1118, in main + pipe = VideoPipeline( + File "/mnt/Meissonic/src/pipeline_video.py", line 324, in __init__ + assert transformer.num_frames == self.F_prime, ( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1964, in __getattr__ + raise AttributeError( +AttributeError: 'DistributedDataParallel' object has no attribute 'num_frames' +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1213, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1118, in main +[rank0]: pipe = VideoPipeline( +[rank0]: File "/mnt/Meissonic/src/pipeline_video.py", line 324, in __init__ +[rank0]: assert transformer.num_frames == self.F_prime, ( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1964, in __getattr__ +[rank0]: raise AttributeError( +[rank0]: AttributeError: 'DistributedDataParallel' object has no attribute 'num_frames' diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/requirements.txt b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 
+nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9ff14e8ec16c886c67fd01e74d3d4010249bac2a --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T10:24:54.751174Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "10", + "--validation_steps", + "100", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, 
+ "disk": { + "/": { + "total": "16650112278528", + "used": "11630717865984" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "pufgsiuucg7dwajur84am7d33j14bmif" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/wandb-summary.json b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..9cbba73a65381915e5d9354665ccff02341a7311 --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/files/wandb-summary.json @@ -0,0 +1 @@ +{"avg_masking_rate":0.335297554731369,"_wandb":{"runtime":423},"_runtime":423.190288624,"_timestamp":1.7651035069632998e+09,"_step":100,"step_loss":11.005426406860352,"lr":0.0003} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug-core.log b/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..21df36279a0f7e42c5bd73d14538646be70d7d6b --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T10:24:54.818108547Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp2wjfrb7n/port-3761300.txt","pid":3761300,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T10:24:54.818545803Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3761300} +{"time":"2025-12-07T10:24:54.818562111Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3761300-3761566-173035857/socket","Net":"unix"}} +{"time":"2025-12-07T10:24:55.004421217Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T10:24:55.010830798Z","level":"INFO","msg":"handleInformInit: received","streamId":"nnww5mz8","id":"1(@)"} +{"time":"2025-12-07T10:24:55.179398975Z","level":"INFO","msg":"handleInformInit: 
stream started","streamId":"nnww5mz8","id":"1(@)"} +{"time":"2025-12-07T10:31:58.586446778Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T10:31:58.586482038Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T10:31:58.586477528Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T10:31:58.586582377Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3761300-3761566-173035857/socket","Net":"unix"}} +{"time":"2025-12-07T10:31:58.586595646Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T10:31:59.215051546Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T10:31:59.215069607Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T10:31:59.215078482Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug-internal.log b/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9e819c66923cac01e01eb0a9954b9d0f9f50d8dc --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T10:24:55.010952866Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T10:24:55.179220242Z","level":"INFO","msg":"stream: created new stream","id":"nnww5mz8"} +{"time":"2025-12-07T10:24:55.179297176Z","level":"INFO","msg":"handler: started","stream_id":"nnww5mz8"} +{"time":"2025-12-07T10:24:55.179391528Z","level":"INFO","msg":"stream: started","id":"nnww5mz8"} +{"time":"2025-12-07T10:24:55.179404012Z","level":"INFO","msg":"writer: started","stream_id":"nnww5mz8"} +{"time":"2025-12-07T10:24:55.179406835Z","level":"INFO","msg":"sender: started","stream_id":"nnww5mz8"} +{"time":"2025-12-07T10:31:58.586496905Z","level":"INFO","msg":"stream: closing","id":"nnww5mz8"} +{"time":"2025-12-07T10:31:58.8536618Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T10:31:59.078362009Z","level":"INFO","msg":"handler: closed","stream_id":"nnww5mz8"} +{"time":"2025-12-07T10:31:59.078428563Z","level":"INFO","msg":"sender: closed","stream_id":"nnww5mz8"} +{"time":"2025-12-07T10:31:59.07844197Z","level":"INFO","msg":"stream: closed","id":"nnww5mz8"} diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug.log b/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e3e354e3a6b575334e4b61784558fa3eba74234b --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_setup.py:_flush():80] Configure stats pid to 3761300 +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 10:24:54,754 INFO MainThread:3761300 
[wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug.log +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_102454-nnww5mz8/logs/debug-internal.log +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_init.py:init():841] calling init triggers +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 10:24:54,754 INFO MainThread:3761300 [wandb_init.py:init():889] starting backend +2025-12-07 10:24:55,004 INFO MainThread:3761300 [wandb_init.py:init():892] sending inform_init request +2025-12-07 10:24:55,009 INFO MainThread:3761300 [wandb_init.py:init():900] backend started and connected +2025-12-07 10:24:55,010 INFO MainThread:3761300 [wandb_init.py:init():970] updated telemetry +2025-12-07 10:24:55,015 INFO MainThread:3761300 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 10:24:55,395 INFO MainThread:3761300 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 10:24:55,506 INFO MainThread:3761300 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 10:24:55,506 INFO MainThread:3761300 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 10:24:55,506 INFO MainThread:3761300 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 10:24:55,506 INFO MainThread:3761300 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 10:24:55,509 INFO MainThread:3761300 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 10:24:55,510 INFO MainThread:3761300 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 10, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 
'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 10:31:58,586 INFO wandb-AsyncioManager-main:3761300 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 10:31:58,586 INFO wandb-AsyncioManager-main:3761300 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251207_102454-nnww5mz8/run-nnww5mz8.wandb b/Meissonic/wandb/run-20251207_102454-nnww5mz8/run-nnww5mz8.wandb new file mode 100644 index 0000000000000000000000000000000000000000..3987a6efeb7b569e3cffbeb02ef1c5b573608057 --- /dev/null +++ b/Meissonic/wandb/run-20251207_102454-nnww5mz8/run-nnww5mz8.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:052b3e37fd9d921d31d12813a0ff5aba51767863bfb2240df6fd0a805abbe905 +size 124049 diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/files/config.yaml b/Meissonic/wandb/run-20251207_103825-issokxuo/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f16bce9afd2266859d9caa7a6c914b4f2328c415 --- /dev/null +++ b/Meissonic/wandb/run-20251207_103825-issokxuo/files/config.yaml @@ -0,0 +1,286 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + bqwpsbwandwsd4n7dzvfyy718qxor0pt: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "10" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11716175208448" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: 
Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T10:38:25.447013Z" + writerId: bqwpsbwandwsd4n7dzvfyy718qxor0pt + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 10 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/files/output.log b/Meissonic/wandb/run-20251207_103825-issokxuo/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a8197bea10c75ccb165541d71301d54fd390a8a4 --- /dev/null +++ b/Meissonic/wandb/run-20251207_103825-issokxuo/files/output.log @@ -0,0 +1,37 @@ +You are using the default legacy behaviour of the . 
This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6571.20it/s] +12/07/2025 10:38:33 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 10:38:33 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 10:38:33 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:38:33 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 10:38:48 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:38:50 - WARNING - __main__ - Failed to load Wan pretrained weights: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768])., continuing with random initialization +12/07/2025 10:38:50 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 10:38:57 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 10:38:57 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 10:38:57 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 10:39:01 - INFO - __main__ - ***** Running training ***** +12/07/2025 10:39:01 - INFO - __main__ - Num training steps = 10000 +12/07/2025 10:39:01 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 10:39:01 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 10:39:01 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 10:39:35 - INFO - __main__ - Step: 10 Loss: 11.1007 LR: 0.000300 +12/07/2025 10:39:35 - INFO - __main__ - Generating videos for validation... +12/07/2025 10:39:35 - INFO - __main__ - Generating videos for validation... 
+Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1227, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1132, in main + pipe = VideoPipeline( + File "/mnt/Meissonic/src/pipeline_video.py", line 324, in __init__ + assert transformer.num_frames == self.F_prime, ( +AssertionError: Transformer num_frames (3) must match compressed frames (2) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1227, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1132, in main +[rank0]: pipe = VideoPipeline( +[rank0]: File "/mnt/Meissonic/src/pipeline_video.py", line 324, in __init__ +[rank0]: assert transformer.num_frames == self.F_prime, ( +[rank0]: AssertionError: Transformer num_frames (3) must match compressed frames (2) diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/files/requirements.txt b/Meissonic/wandb/run-20251207_103825-issokxuo/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_103825-issokxuo/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 
+scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_103825-issokxuo/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4f584103586a3da93c9ce914618bea907cbb0990 --- /dev/null +++ b/Meissonic/wandb/run-20251207_103825-issokxuo/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T10:38:25.447013Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716175208448" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": 
"42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "bqwpsbwandwsd4n7dzvfyy718qxor0pt" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/files/wandb-summary.json b/Meissonic/wandb/run-20251207_103825-issokxuo/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..97caf3fac0bb2218a5897925f79a42d7299622ed --- /dev/null +++ b/Meissonic/wandb/run-20251207_103825-issokxuo/files/wandb-summary.json @@ -0,0 +1 @@ +{"lr":0.0003,"avg_masking_rate":0.45858341455459595,"_timestamp":1.7651039754151723e+09,"_step":10,"_wandb":{"runtime":69},"_runtime":69.181347361,"step_loss":11.1007080078125} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug-core.log b/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..dacfe5bc944820b864a7ddf9d031907d6799b111 --- /dev/null +++ b/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T10:38:25.516404679Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpz__xm9s_/port-3794655.txt","pid":3794655,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T10:38:25.516910509Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3794655} +{"time":"2025-12-07T10:38:25.516914595Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3794655-3794898-1742497710/socket","Net":"unix"}} +{"time":"2025-12-07T10:38:25.702529543Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T10:38:25.708761339Z","level":"INFO","msg":"handleInformInit: received","streamId":"issokxuo","id":"1(@)"} +{"time":"2025-12-07T10:38:25.876367309Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"issokxuo","id":"1(@)"} +{"time":"2025-12-07T10:39:35.432661696Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T10:39:35.432720654Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T10:39:35.432710717Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T10:39:35.432765287Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T10:39:35.43283176Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3794655-3794898-1742497710/socket","Net":"unix"}} +{"time":"2025-12-07T10:39:35.930654265Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T10:39:35.930678334Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} 
+{"time":"2025-12-07T10:39:35.93069107Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug-internal.log b/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..18355cc9d8d043be65eba8ef49a2f45bed09403f --- /dev/null +++ b/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T10:38:25.70892631Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T10:38:25.875978038Z","level":"INFO","msg":"stream: created new stream","id":"issokxuo"} +{"time":"2025-12-07T10:38:25.876226531Z","level":"INFO","msg":"handler: started","stream_id":"issokxuo"} +{"time":"2025-12-07T10:38:25.876358861Z","level":"INFO","msg":"stream: started","id":"issokxuo"} +{"time":"2025-12-07T10:38:25.876367804Z","level":"INFO","msg":"writer: started","stream_id":"issokxuo"} +{"time":"2025-12-07T10:38:25.876405839Z","level":"INFO","msg":"sender: started","stream_id":"issokxuo"} +{"time":"2025-12-07T10:39:35.432726459Z","level":"INFO","msg":"stream: closing","id":"issokxuo"} +{"time":"2025-12-07T10:39:35.788975189Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T10:39:35.92749075Z","level":"INFO","msg":"handler: closed","stream_id":"issokxuo"} +{"time":"2025-12-07T10:39:35.927607405Z","level":"INFO","msg":"sender: closed","stream_id":"issokxuo"} +{"time":"2025-12-07T10:39:35.927615114Z","level":"INFO","msg":"stream: closed","id":"issokxuo"} diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug.log b/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..910ace7e3f1a3a9a89746c9523ee03be72c4ac63 --- /dev/null +++ b/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_setup.py:_flush():80] Configure stats pid to 3794655 +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug.log +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_103825-issokxuo/logs/debug-internal.log +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_init.py:init():841] calling init triggers +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 10:38:25,450 INFO MainThread:3794655 [wandb_init.py:init():889] starting backend +2025-12-07 10:38:25,702 INFO MainThread:3794655 [wandb_init.py:init():892] sending inform_init request +2025-12-07 10:38:25,707 INFO MainThread:3794655 [wandb_init.py:init():900] backend started and connected +2025-12-07 10:38:25,708 INFO MainThread:3794655 [wandb_init.py:init():970] updated 
telemetry +2025-12-07 10:38:25,712 INFO MainThread:3794655 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 10:38:26,250 INFO MainThread:3794655 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 10:38:26,361 INFO MainThread:3794655 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 10:38:26,361 INFO MainThread:3794655 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 10:38:26,361 INFO MainThread:3794655 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 10:38:26,361 INFO MainThread:3794655 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 10:38:26,364 INFO MainThread:3794655 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 10:38:26,365 INFO MainThread:3794655 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 10:39:35,432 INFO wandb-AsyncioManager-main:3794655 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 10:39:35,432 INFO wandb-AsyncioManager-main:3794655 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_103825-issokxuo/run-issokxuo.wandb b/Meissonic/wandb/run-20251207_103825-issokxuo/run-issokxuo.wandb new file mode 100644 index 0000000000000000000000000000000000000000..0a6ccd6ba2512e95502251629c7156e1dc69ddb8 Binary files /dev/null and b/Meissonic/wandb/run-20251207_103825-issokxuo/run-issokxuo.wandb differ diff --git a/Meissonic/wandb/run-20251207_104819-tswns3jx/files/output.log b/Meissonic/wandb/run-20251207_104819-tswns3jx/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..c427e00c611fbeac0c401b717809b25c162c1b42 --- /dev/null +++ b/Meissonic/wandb/run-20251207_104819-tswns3jx/files/output.log @@ -0,0 +1,314 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10155.70it/s] +12/07/2025 10:48:27 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 10:48:27 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 10:48:27 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:48:28 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 10:48:43 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:48:45 - WARNING - __main__ - Failed to load Wan pretrained weights: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768])., continuing with random initialization +12/07/2025 10:48:45 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 10:48:52 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 10:48:52 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 10:48:52 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 10:48:57 - INFO - __main__ - ***** Running training ***** +12/07/2025 10:48:57 - INFO - __main__ - Num training steps = 10000 +12/07/2025 10:48:57 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 10:48:57 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 10:48:57 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 10:49:30 - INFO - __main__ - Step: 10 Loss: 11.1010 LR: 0.000300 +12/07/2025 10:49:30 - INFO - __main__ - Generating videos for validation... +12/07/2025 10:49:30 - INFO - __main__ - Generating videos for validation... + 0%| | 0/48 [00:00. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7044.18it/s] +12/07/2025 10:53:20 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 10:53:20 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 10:53:20 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:53:20 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 10:53:35 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:53:35 - WARNING - __main__ - Failed to load Wan pretrained weights: Could not find state dict in Wan-AI/Wan2.1-T2V-1.3B +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 606, in main + raise FileNotFoundError(f"Could not find state dict in {args.wan_pretrained_path}") +FileNotFoundError: Could not find state dict in Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 10:53:35 - WARNING - __main__ - Continuing with random initialization +12/07/2025 10:53:35 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 10:53:43 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 10:53:43 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 10:53:43 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 10:53:48 - INFO - __main__ - ***** Running training ***** +12/07/2025 10:53:48 - INFO - __main__ - Num training steps = 10000 +12/07/2025 10:53:48 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 10:53:48 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 10:53:48 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 10:54:23 - INFO - __main__ - Step: 10 Loss: 11.1033 LR: 0.000300 +12/07/2025 10:54:23 - INFO - __main__ - Generating videos for validation... +12/07/2025 10:54:23 - INFO - __main__ - Generating videos for validation... 
+ 0%| | 0/48 [00:00 + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1047, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 953, in forward + out_list = torch.utils.checkpoint.checkpoint( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 948, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return 
forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 649, in forward + sinusoidal_embedding_1d(self.freq_dim, + File "/mnt/Meissonic/src/transformer_video.py", line 205, in sinusoidal_embedding_1d + position, torch.pow(10000, -torch.arange(half).to(position).div(half))) +torch.AcceleratorError: CUDA error: an illegal memory access was encountered +Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1235, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1047, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward +[rank0]: return model_forward(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) 
+[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 953, in forward +[rank0]: out_list = torch.utils.checkpoint.checkpoint( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint +[rank0]: ret = function(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 948, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 649, in forward +[rank0]: sinusoidal_embedding_1d(self.freq_dim, +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 205, in sinusoidal_embedding_1d +[rank0]: position, torch.pow(10000, -torch.arange(half).to(position).div(half))) +[rank0]: torch.AcceleratorError: CUDA error: an illegal memory access was encountered +[rank0]: Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. +[rank0]: CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +[rank0]: For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +[rank0]: Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
diff --git a/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/requirements.txt b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..81b0109e13836a733aad421e3c32314188ff224a --- /dev/null +++ b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 
3.10.19", + "startedAt": "2025-12-07T10:53:12.527306Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716175732736" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "6g3pak4wzliukydp7zdvd974sucm3b2d" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/wandb-summary.json 
b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d998676e7d1dfbd171237ecb1b26158b17cb52c7 --- /dev/null +++ b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/files/wandb-summary.json @@ -0,0 +1 @@ +{"lr":0.0003,"avg_masking_rate":0.45858341455459595,"_timestamp":1.7651048638068483e+09,"_runtime":71.916017338,"_wandb":{"runtime":71},"_step":10,"step_loss":11.103349685668945} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug-core.log b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..54410d233ec4c6e6d2496b4554406d4b86806bac --- /dev/null +++ b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T10:53:12.598190067Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp4siuqzlw/port-3828474.txt","pid":3828474,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T10:53:12.598671365Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3828474} +{"time":"2025-12-07T10:53:12.598685726Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3828474-3828789-3176264225/socket","Net":"unix"}} +{"time":"2025-12-07T10:53:12.784534583Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T10:53:12.79077906Z","level":"INFO","msg":"handleInformInit: received","streamId":"tr2qiwbt","id":"1(@)"} +{"time":"2025-12-07T10:53:12.955952996Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"tr2qiwbt","id":"1(@)"} +{"time":"2025-12-07T10:54:25.165824225Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T10:54:25.165901787Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T10:54:25.165888985Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T10:54:25.166018647Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T10:54:25.165985957Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3828474-3828789-3176264225/socket","Net":"unix"}} +{"time":"2025-12-07T10:54:25.690392584Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T10:54:25.690412694Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T10:54:25.690421794Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug-internal.log b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a4664965c64e4a217c391a07c50f45ddb9efdd70 --- /dev/null +++ b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T10:53:12.790917926Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T10:53:12.955706909Z","level":"INFO","msg":"stream: created new stream","id":"tr2qiwbt"} +{"time":"2025-12-07T10:53:12.955811909Z","level":"INFO","msg":"handler: started","stream_id":"tr2qiwbt"} +{"time":"2025-12-07T10:53:12.955944475Z","level":"INFO","msg":"stream: 
started","id":"tr2qiwbt"} +{"time":"2025-12-07T10:53:12.955955108Z","level":"INFO","msg":"writer: started","stream_id":"tr2qiwbt"} +{"time":"2025-12-07T10:53:12.955962108Z","level":"INFO","msg":"sender: started","stream_id":"tr2qiwbt"} +{"time":"2025-12-07T10:54:25.165911258Z","level":"INFO","msg":"stream: closing","id":"tr2qiwbt"} +{"time":"2025-12-07T10:54:25.552800258Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T10:54:25.686991267Z","level":"INFO","msg":"handler: closed","stream_id":"tr2qiwbt"} +{"time":"2025-12-07T10:54:25.687069344Z","level":"INFO","msg":"sender: closed","stream_id":"tr2qiwbt"} +{"time":"2025-12-07T10:54:25.6870763Z","level":"INFO","msg":"stream: closed","id":"tr2qiwbt"} diff --git a/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug.log b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..408b963b813e4c3471b17521ca6fc8fa48d0f64d --- /dev/null +++ b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 10:53:12,530 INFO MainThread:3828474 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 10:53:12,530 INFO MainThread:3828474 [wandb_setup.py:_flush():80] Configure stats pid to 3828474 +2025-12-07 10:53:12,530 INFO MainThread:3828474 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 10:53:12,530 INFO MainThread:3828474 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 10:53:12,530 INFO MainThread:3828474 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 10:53:12,530 INFO MainThread:3828474 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug.log +2025-12-07 10:53:12,530 INFO MainThread:3828474 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_105312-tr2qiwbt/logs/debug-internal.log +2025-12-07 10:53:12,531 INFO MainThread:3828474 [wandb_init.py:init():841] calling init triggers +2025-12-07 10:53:12,531 INFO MainThread:3828474 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 10:53:12,531 INFO MainThread:3828474 [wandb_init.py:init():889] starting backend +2025-12-07 10:53:12,784 INFO MainThread:3828474 [wandb_init.py:init():892] sending inform_init request +2025-12-07 10:53:12,789 INFO MainThread:3828474 [wandb_init.py:init():900] backend started and connected +2025-12-07 10:53:12,791 INFO MainThread:3828474 [wandb_init.py:init():970] updated telemetry +2025-12-07 10:53:12,795 INFO MainThread:3828474 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 10:53:13,249 INFO MainThread:3828474 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 10:53:13,359 INFO MainThread:3828474 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 10:53:13,359 INFO MainThread:3828474 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 10:53:13,359 INFO MainThread:3828474 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 10:53:13,359 INFO MainThread:3828474 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 10:53:13,362 INFO MainThread:3828474 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 10:53:13,363 INFO MainThread:3828474 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 10:54:25,165 INFO wandb-AsyncioManager-main:3828474 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 10:54:25,166 INFO wandb-AsyncioManager-main:3828474 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_105312-tr2qiwbt/run-tr2qiwbt.wandb b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/run-tr2qiwbt.wandb new file mode 100644 index 0000000000000000000000000000000000000000..1e6c723f189afd3412970bc782613440bd91fb75 Binary files /dev/null and b/Meissonic/wandb/run-20251207_105312-tr2qiwbt/run-tr2qiwbt.wandb differ diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/config.yaml b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..230dda3a8395a1954ef2011c0cdf482d9254a13f --- /dev/null +++ b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/config.yaml @@ -0,0 +1,286 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + iamstk2h1hf34ugp4q80ww9pxd9navmp: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "10" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11716175966208" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T11:07:39.438261Z" + writerId: iamstk2h1hf34ugp4q80ww9pxd9navmp + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 10 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/output.log b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..fc061c5b01c018666e136fe18380bf68c9249e66 --- /dev/null +++ b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/output.log @@ -0,0 +1,324 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6304.52it/s] +12/07/2025 11:07:47 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 11:07:47 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 11:07:47 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:07:47 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:08:02 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:08:02 - WARNING - __main__ - Failed to load Wan pretrained weights: Could not find state dict in Wan-AI/Wan2.1-T2V-1.3B +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 606, in main + raise FileNotFoundError(f"Could not find state dict in {args.wan_pretrained_path}") +FileNotFoundError: Could not find state dict in Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:08:02 - WARNING - __main__ - Continuing with random initialization +12/07/2025 11:08:04 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:08:11 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:08:11 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:08:11 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:08:15 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:08:15 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:08:15 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 11:08:15 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 11:08:15 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 11:08:47 - INFO - __main__ - Step: 10 Loss: 11.1051 LR: 0.000300 +12/07/2025 11:08:47 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:08:48 - INFO - __main__ - Generating videos for validation... 
+ 0%| | 0/48 [00:00 + first_frame = video[0] + File "/mnt/Meissonic/train/train_mei_video.py", line 974, in main + weight_dtype = torch.float16 + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 579, in __iter__ + next_batch = next(dataloader_iter) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ + data = self._next_data() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/mnt/Meissonic/train/dataset_utils.py", line 611, in __getitem__ + video_tensor = self._load_video(video_path) + File "/mnt/Meissonic/train/dataset_utils.py", line 597, in _load_video + return self._load_video_decord(full_path) + File "/mnt/Meissonic/train/dataset_utils.py", line 474, in _load_video_decord + frames = frames.float() # [F, H, W, C] +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1252, in +[rank0]: first_frame = video[0] +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 974, in main +[rank0]: weight_dtype = torch.float16 +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 579, in __iter__ +[rank0]: next_batch = next(dataloader_iter) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ +[rank0]: data = self._next_data() +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data +[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch +[rank0]: data = [self.dataset[idx] for idx in possibly_batched_index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in +[rank0]: data = [self.dataset[idx] for idx in possibly_batched_index] +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 611, in __getitem__ +[rank0]: video_tensor = self._load_video(video_path) +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 597, in _load_video +[rank0]: return self._load_video_decord(full_path) +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 474, in _load_video_decord +[rank0]: frames = frames.float() # [F, H, W, C] +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/requirements.txt b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 
+dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2493bc7209af2ae43af93f208906a9cc8028f849 --- /dev/null +++ b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:07:39.438261Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + 
"--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716175966208" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "iamstk2h1hf34ugp4q80ww9pxd9navmp" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/wandb-summary.json b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ae588ea4aaa2b3f81230765f6a473259e4335964 --- /dev/null +++ b/Meissonic/wandb/run-20251207_110739-c8fv99fg/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"_wandb":{"runtime":201},"_runtime":201.361041497,"avg_masking_rate":0.6001749634742737,"_timestamp":1.7651058554341211e+09,"_step":60,"step_loss":11.095102310180664,"lr":0.0003} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug-core.log b/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..535f7011db40258254ac37bed7d9d00c64ecf60f --- /dev/null +++ b/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T11:07:39.517839269Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpigys55qr/port-3839204.txt","pid":3839204,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:07:39.5184867Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3839204} +{"time":"2025-12-07T11:07:39.518466907Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3839204-3839450-2318571528/socket","Net":"unix"}} +{"time":"2025-12-07T11:07:39.704457068Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T11:07:39.711824917Z","level":"INFO","msg":"handleInformInit: received","streamId":"c8fv99fg","id":"1(@)"} +{"time":"2025-12-07T11:07:39.876019154Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"c8fv99fg","id":"1(@)"} +{"time":"2025-12-07T11:11:01.47491143Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T11:11:01.474962856Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T11:11:01.474954815Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T11:11:01.475086179Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T11:11:01.475076565Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3839204-3839450-2318571528/socket","Net":"unix"}} +{"time":"2025-12-07T11:11:01.918526535Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T11:11:01.918551823Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T11:11:01.918564987Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug-internal.log b/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3d61a83b2b086978242867443e19f1c6c6603612 --- /dev/null +++ b/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T11:07:39.711945258Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:07:39.875768008Z","level":"INFO","msg":"stream: created new stream","id":"c8fv99fg"} +{"time":"2025-12-07T11:07:39.875878373Z","level":"INFO","msg":"handler: started","stream_id":"c8fv99fg"} +{"time":"2025-12-07T11:07:39.8760112Z","level":"INFO","msg":"stream: started","id":"c8fv99fg"} +{"time":"2025-12-07T11:07:39.876028206Z","level":"INFO","msg":"sender: started","stream_id":"c8fv99fg"} +{"time":"2025-12-07T11:07:39.876028892Z","level":"INFO","msg":"writer: started","stream_id":"c8fv99fg"} 
+{"time":"2025-12-07T11:11:01.47498487Z","level":"INFO","msg":"stream: closing","id":"c8fv99fg"} +{"time":"2025-12-07T11:11:01.734581776Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T11:11:01.915108167Z","level":"INFO","msg":"handler: closed","stream_id":"c8fv99fg"} +{"time":"2025-12-07T11:11:01.915220178Z","level":"INFO","msg":"sender: closed","stream_id":"c8fv99fg"} +{"time":"2025-12-07T11:11:01.915241328Z","level":"INFO","msg":"stream: closed","id":"c8fv99fg"} diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug.log b/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5432ef3bf5a87f5c6db850cfab71898d4ffe78d1 --- /dev/null +++ b/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_setup.py:_flush():80] Configure stats pid to 3839204 +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug.log +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_110739-c8fv99fg/logs/debug-internal.log +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:07:39,441 INFO MainThread:3839204 [wandb_init.py:init():889] starting backend +2025-12-07 11:07:39,704 INFO MainThread:3839204 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:07:39,710 INFO MainThread:3839204 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:07:39,713 INFO MainThread:3839204 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:07:39,719 INFO MainThread:3839204 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:07:40,113 INFO MainThread:3839204 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:07:40,288 INFO MainThread:3839204 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:07:40,288 INFO MainThread:3839204 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:07:40,288 INFO MainThread:3839204 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:07:40,289 INFO MainThread:3839204 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 11:07:40,291 INFO MainThread:3839204 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:07:40,294 INFO MainThread:3839204 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 11:11:01,475 INFO wandb-AsyncioManager-main:3839204 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 11:11:01,475 INFO wandb-AsyncioManager-main:3839204 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251207_110739-c8fv99fg/run-c8fv99fg.wandb b/Meissonic/wandb/run-20251207_110739-c8fv99fg/run-c8fv99fg.wandb new file mode 100644 index 0000000000000000000000000000000000000000..13786688ac8269490b32128e17b1570505c5dca4 Binary files /dev/null and b/Meissonic/wandb/run-20251207_110739-c8fv99fg/run-c8fv99fg.wandb differ diff --git a/Meissonic/wandb/run-20251207_111118-qo5llcgp/files/output.log b/Meissonic/wandb/run-20251207_111118-qo5llcgp/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2b0b0fdb98667b0965defe4f84a0d1a7a5735014 --- /dev/null +++ b/Meissonic/wandb/run-20251207_111118-qo5llcgp/files/output.log @@ -0,0 +1,113 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7206.71it/s] +12/07/2025 11:11:26 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 11:11:26 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 11:11:26 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:11:26 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:11:42 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:11:42 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:11:43 - WARNING - __main__ - Failed to load Wan pretrained weights: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 697, in main + missing_keys, unexpected_keys = model.backbone.load_state_dict(wan_state_dict, strict=False) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2629, in load_state_dict + raise RuntimeError( +RuntimeError: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +12/07/2025 11:11:43 - WARNING - __main__ - Continuing with random initialization +12/07/2025 11:11:45 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:11:52 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:11:52 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:11:52 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:11:57 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:11:57 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:11:57 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 11:11:57 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 11:11:57 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 11:12:25 - INFO - __main__ - Step: 10 Loss: 11.1015 LR: 0.000300 +12/07/2025 11:12:25 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:12:25 - INFO - __main__ - Generating videos for validation... + 0%| | 0/48 [00:00. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5307.33it/s] +12/07/2025 11:15:25 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 11:15:25 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 11:15:25 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:15:26 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:15:41 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:15:41 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:15:43 - WARNING - __main__ - Failed to load Wan pretrained weights: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 697, in main + missing_keys, unexpected_keys = model.backbone.load_state_dict(wan_state_dict, strict=False) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2629, in load_state_dict + raise RuntimeError( +RuntimeError: Error(s) in loading state_dict for WanModel: + size mismatch for text_embedding.0.weight: copying a param with shape torch.Size([1536, 4096]) from checkpoint, the shape in current model is torch.Size([1536, 768]). +12/07/2025 11:15:43 - WARNING - __main__ - Continuing with random initialization +12/07/2025 11:15:44 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:15:52 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:15:52 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:15:52 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:15:57 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:15:57 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:15:57 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 11:15:57 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 11:15:57 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 11:16:26 - INFO - __main__ - Step: 10 Loss: 11.1020 LR: 0.000300 +12/07/2025 11:16:26 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:16:26 - INFO - __main__ - Generating videos for validation... 
+ 0%| | 0/48 [00:00 + accelerator.end_training() + File "/mnt/Meissonic/train/train_mei_video.py", line 1166, in main + reduction="mean", + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1318, in +[rank0]: accelerator.end_training() +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1166, in main +[rank0]: reduction="mean", +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251207_111518-slrbepi0/files/requirements.txt b/Meissonic/wandb/run-20251207_111518-slrbepi0/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_111518-slrbepi0/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 
+bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_111518-slrbepi0/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_111518-slrbepi0/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..bb45acc2390e9d0441450deeb01f40bd99e33c47 --- /dev/null +++ b/Meissonic/wandb/run-20251207_111518-slrbepi0/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:15:18.058190Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": 
"/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716176494592" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "kd83mi5rzpvcfa163dq5rf6u1glhcjp9" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_111518-slrbepi0/files/wandb-summary.json b/Meissonic/wandb/run-20251207_111518-slrbepi0/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f44cc64f6325c3ee6bd67c7e5f886440fe552bb2 --- /dev/null +++ b/Meissonic/wandb/run-20251207_111518-slrbepi0/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":251},"_runtime":251.287016901,"_step":80,"step_loss":11.093503952026367,"lr":0.0003,"avg_masking_rate":0.8475728034973145,"_timestamp":1.7651063494351544e+09} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug-core.log b/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..868ef594f340a2b1bcf56e62a85e84e2814a332b --- /dev/null +++ b/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T11:15:18.12560228Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpab93sb5c/port-3869225.txt","pid":3869225,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:15:18.126159144Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3869225} +{"time":"2025-12-07T11:15:18.126139576Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3869225-3869517-1551466808/socket","Net":"unix"}} +{"time":"2025-12-07T11:15:18.311655035Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} 
+{"time":"2025-12-07T11:15:18.317760189Z","level":"INFO","msg":"handleInformInit: received","streamId":"slrbepi0","id":"1(@)"} +{"time":"2025-12-07T11:15:18.488394316Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"slrbepi0","id":"1(@)"} +{"time":"2025-12-07T11:19:30.017462329Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T11:19:30.017536213Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T11:19:30.017525938Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T11:19:30.017609265Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T11:19:30.017652007Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3869225-3869517-1551466808/socket","Net":"unix"}} +{"time":"2025-12-07T11:19:30.472111226Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T11:19:30.472137344Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T11:19:30.472149074Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug-internal.log b/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..914729f6a9e1f4e3f4116e7b3a1b95f83654ccfb --- /dev/null +++ b/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T11:15:18.317920493Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:15:18.48813955Z","level":"INFO","msg":"stream: created new stream","id":"slrbepi0"} +{"time":"2025-12-07T11:15:18.488248739Z","level":"INFO","msg":"handler: started","stream_id":"slrbepi0"} +{"time":"2025-12-07T11:15:18.48838537Z","level":"INFO","msg":"stream: started","id":"slrbepi0"} +{"time":"2025-12-07T11:15:18.488396702Z","level":"INFO","msg":"writer: started","stream_id":"slrbepi0"} +{"time":"2025-12-07T11:15:18.488403334Z","level":"INFO","msg":"sender: started","stream_id":"slrbepi0"} +{"time":"2025-12-07T11:19:30.01754506Z","level":"INFO","msg":"stream: closing","id":"slrbepi0"} +{"time":"2025-12-07T11:19:30.328391947Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T11:19:30.468945883Z","level":"INFO","msg":"handler: closed","stream_id":"slrbepi0"} +{"time":"2025-12-07T11:19:30.469063742Z","level":"INFO","msg":"sender: closed","stream_id":"slrbepi0"} +{"time":"2025-12-07T11:19:30.469088818Z","level":"INFO","msg":"stream: closed","id":"slrbepi0"} diff --git a/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug.log b/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..4c29bffa5e375fa6fa46663a1322faf8814e17bf --- /dev/null +++ b/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_setup.py:_flush():80] Configure stats pid to 3869225 +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings 
+2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug.log +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_111518-slrbepi0/logs/debug-internal.log +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:15:18,061 INFO MainThread:3869225 [wandb_init.py:init():889] starting backend +2025-12-07 11:15:18,311 INFO MainThread:3869225 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:15:18,316 INFO MainThread:3869225 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:15:18,317 INFO MainThread:3869225 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:15:18,321 INFO MainThread:3869225 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:15:18,729 INFO MainThread:3869225 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:15:18,845 INFO MainThread:3869225 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:15:18,845 INFO MainThread:3869225 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:15:18,845 INFO MainThread:3869225 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:15:18,845 INFO MainThread:3869225 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 11:15:18,847 INFO MainThread:3869225 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:15:18,848 INFO MainThread:3869225 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': 
None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 11:19:30,017 INFO wandb-AsyncioManager-main:3869225 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 11:19:30,017 INFO wandb-AsyncioManager-main:3869225 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251207_111518-slrbepi0/run-slrbepi0.wandb b/Meissonic/wandb/run-20251207_111518-slrbepi0/run-slrbepi0.wandb new file mode 100644 index 0000000000000000000000000000000000000000..7f2ff72da2c64a1f5097be140fe7c360bb575fab --- /dev/null +++ b/Meissonic/wandb/run-20251207_111518-slrbepi0/run-slrbepi0.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:717e2148b6c79cbdfdbba8e3015131a2a84ca1420c20115b8633f5743ac10bd2 +size 119050 diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/files/config.yaml b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e79f244e0ffd7e8528a1a0effc05d1f1a2881b25 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/config.yaml @@ -0,0 +1,286 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 1idbsopmjtvvium0a9bmdqpdc585e7nj: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "10" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11716176875520" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: 
Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T11:20:22.715179Z" + writerId: 1idbsopmjtvvium0a9bmdqpdc585e7nj + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 10 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/files/output.log b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5772e111db5c2aa3c3aced6edf076202afaf901b --- /dev/null +++ 
b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/output.log @@ -0,0 +1,118 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5615.94it/s] +12/07/2025 11:20:30 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 11:20:30 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 11:20:30 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:20:30 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:20:46 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:20:46 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:20:46 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 11:20:47 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 11:20:47 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 11:20:49 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:20:56 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:20:56 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:20:56 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:21:00 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:21:00 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:21:00 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 11:21:00 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 11:21:00 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 11:21:29 - INFO - __main__ - Step: 10 Loss: 11.1047 LR: 0.000300 +12/07/2025 11:21:29 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:21:29 - INFO - __main__ - Generating videos for validation... 
+ 0%| | 0/48 [00:00<?, ?it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in <module> + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1047, in main + for batch in train_dataloader: + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 579, in __iter__ + next_batch = next(dataloader_iter) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ + data = self._next_data() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp> + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/mnt/Meissonic/train/dataset_utils.py", line 618, in __getitem__ + video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) + File "/mnt/Meissonic/train/dataset_utils.py", line 305, in process_video + video_tensor = torch.nn.functional.interpolate( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 4768, in interpolate + return torch._C._nn.upsample_bilinear2d( +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in <module> +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1047, in main +[rank0]: for batch in train_dataloader: +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 579, in __iter__ +[rank0]: next_batch = next(dataloader_iter) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ +[rank0]: data = self._next_data() +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data +[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch +[rank0]: data = [self.dataset[idx] for idx in possibly_batched_index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp> +[rank0]: data = [self.dataset[idx] for idx in possibly_batched_index] +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 618, in __getitem__ +[rank0]: video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 305, in process_video +[rank0]: video_tensor = torch.nn.functional.interpolate( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 4768, in interpolate +[rank0]: return torch._C._nn.upsample_bilinear2d( +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/files/requirements.txt b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/requirements.txt new file mode 100644 index
0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..dae49b24d59c4889c998bc25d70b51399ad917bf --- /dev/null +++ b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:20:22.715179Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + 
"--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716176875520" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "1idbsopmjtvvium0a9bmdqpdc585e7nj" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/files/wandb-summary.json b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..fcdfe1306b3df24901cdf6ef13b1ca5932661de1 --- /dev/null +++ 
b/Meissonic/wandb/run-20251207_112022-0cuseww8/files/wandb-summary.json @@ -0,0 +1 @@ +{"step_loss":11.102934837341309,"lr":0.0003,"_wandb":{"runtime":93},"_runtime":93.204378723,"avg_masking_rate":0.7513207197189331,"_timestamp":1.7651065139582825e+09,"_step":20} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug-core.log b/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2cf4a236744c088f3dbb1ae42605a2eb46d61420 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T11:20:22.785463123Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmppwkk7n57/port-3895473.txt","pid":3895473,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:20:22.786060377Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3895473} +{"time":"2025-12-07T11:20:22.786035354Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3895473-3895721-795491129/socket","Net":"unix"}} +{"time":"2025-12-07T11:20:22.97127649Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T11:20:22.977463885Z","level":"INFO","msg":"handleInformInit: received","streamId":"0cuseww8","id":"1(@)"} +{"time":"2025-12-07T11:20:23.14627883Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"0cuseww8","id":"1(@)"} +{"time":"2025-12-07T11:21:56.548196392Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T11:21:56.548278751Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T11:21:56.548346443Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T11:21:56.548303098Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T11:21:56.54846317Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3895473-3895721-795491129/socket","Net":"unix"}} +{"time":"2025-12-07T11:21:56.897755703Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T11:21:56.897780474Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T11:21:56.897790066Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug-internal.log b/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5aae84a9f82bd4b1358bbfef2cab62f2e396ab4e --- /dev/null +++ b/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T11:20:22.977605376Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:20:23.146011141Z","level":"INFO","msg":"stream: created new stream","id":"0cuseww8"} +{"time":"2025-12-07T11:20:23.146110903Z","level":"INFO","msg":"handler: started","stream_id":"0cuseww8"} +{"time":"2025-12-07T11:20:23.146252757Z","level":"INFO","msg":"stream: started","id":"0cuseww8"} +{"time":"2025-12-07T11:20:23.146275806Z","level":"INFO","msg":"writer: started","stream_id":"0cuseww8"} +{"time":"2025-12-07T11:20:23.146295597Z","level":"INFO","msg":"sender: 
started","stream_id":"0cuseww8"} +{"time":"2025-12-07T11:21:56.548306758Z","level":"INFO","msg":"stream: closing","id":"0cuseww8"} +{"time":"2025-12-07T11:21:56.785271617Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T11:21:56.894677028Z","level":"INFO","msg":"handler: closed","stream_id":"0cuseww8"} +{"time":"2025-12-07T11:21:56.894755859Z","level":"INFO","msg":"sender: closed","stream_id":"0cuseww8"} +{"time":"2025-12-07T11:21:56.894765451Z","level":"INFO","msg":"stream: closed","id":"0cuseww8"} diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug.log b/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b991716816a00786d2f8527b1ed239d1995afdca --- /dev/null +++ b/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_setup.py:_flush():80] Configure stats pid to 3895473 +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug.log +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_112022-0cuseww8/logs/debug-internal.log +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:20:22,718 INFO MainThread:3895473 [wandb_init.py:init():889] starting backend +2025-12-07 11:20:22,971 INFO MainThread:3895473 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:20:22,975 INFO MainThread:3895473 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:20:22,977 INFO MainThread:3895473 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:20:22,981 INFO MainThread:3895473 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:20:23,343 INFO MainThread:3895473 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:20:23,454 INFO MainThread:3895473 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:20:23,454 INFO MainThread:3895473 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:20:23,454 INFO MainThread:3895473 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:20:23,454 INFO MainThread:3895473 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 11:20:23,457 INFO MainThread:3895473 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:20:23,457 INFO MainThread:3895473 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 11:21:56,548 INFO wandb-AsyncioManager-main:3895473 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 11:21:56,548 INFO wandb-AsyncioManager-main:3895473 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_112022-0cuseww8/run-0cuseww8.wandb b/Meissonic/wandb/run-20251207_112022-0cuseww8/run-0cuseww8.wandb new file mode 100644 index 0000000000000000000000000000000000000000..db1715fc161a099842483f3fb8b42038e310d125 Binary files /dev/null and b/Meissonic/wandb/run-20251207_112022-0cuseww8/run-0cuseww8.wandb differ diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/config.yaml b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53cb410ffe972010f7c37305d52fe478a8efd8d7 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/config.yaml @@ -0,0 +1,286 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 2cqg2983w9j3j0ixehtv587tdowv06d6: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "10" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11716177002496" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T11:23:12.368744Z" + writerId: 2cqg2983w9j3j0ixehtv587tdowv06d6 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 10 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/output.log b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..6d09e745c2d329388d665acaa37887a8b30077aa --- /dev/null +++ b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/output.log @@ -0,0 +1,73 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9760.68it/s] +12/07/2025 11:23:20 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 11:23:20 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 11:23:20 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:23:20 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:23:36 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:23:36 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:23:36 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 11:23:38 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 11:23:38 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 11:23:39 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:23:46 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:23:46 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:23:46 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:23:52 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:23:52 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:23:52 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 11:23:52 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 11:23:52 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 11:24:22 - INFO - __main__ - Step: 10 Loss: 11.1046 LR: 0.000300 +12/07/2025 11:24:22 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:24:22 - INFO - __main__ - Generating videos for validation... 
+ 0%| | 0/48 [00:00<?, ?it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in <module> + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1047, in main + for batch in train_dataloader: + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 579, in __iter__ + next_batch = next(dataloader_iter) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ + data = self._next_data() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp> + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/mnt/Meissonic/train/dataset_utils.py", line 618, in __getitem__ + video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) + File "/mnt/Meissonic/train/dataset_utils.py", line 289, in process_video + if video_tensor.max() > 1.0: +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in <module> +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1047, in main +[rank0]: for batch in train_dataloader: +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 579, in __iter__ +[rank0]: next_batch = next(dataloader_iter) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ +[rank0]: data = self._next_data() +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data +[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch +[rank0]: data = [self.dataset[idx] for idx in possibly_batched_index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp> +[rank0]: data = [self.dataset[idx] for idx in possibly_batched_index] +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 618, in __getitem__ +[rank0]: video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 289, in process_video +[rank0]: if video_tensor.max() > 1.0: +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/requirements.txt b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4
+filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..900f0486cecb3bafcbb2fb14dc3b82b047708b93 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:23:12.368744Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + 
"--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716177002496" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "2cqg2983w9j3j0ixehtv587tdowv06d6" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/wandb-summary.json b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..da7c4f96e9bd1b1f14321fdd8c0cb85bcb658c33 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112312-pnd8l27j/files/wandb-summary.json @@ -0,0 +1 @@ +{"_timestamp":1.7651066620913393e+09,"_step":10,"_wandb":{"runtime":75},"_runtime":75.998973295,"step_loss":11.10457706451416,"lr":0.0003,"avg_masking_rate":0.45858341455459595} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug-core.log 
b/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..ee21183b298966cbda121e56f252c2290a41367e --- /dev/null +++ b/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T11:23:12.437999516Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpme82jobd/port-3903847.txt","pid":3903847,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:23:12.438523342Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3903847} +{"time":"2025-12-07T11:23:12.438506797Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3903847-3904086-2511465429/socket","Net":"unix"}} +{"time":"2025-12-07T11:23:12.624454045Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T11:23:12.630584178Z","level":"INFO","msg":"handleInformInit: received","streamId":"pnd8l27j","id":"1(@)"} +{"time":"2025-12-07T11:23:12.908024352Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"pnd8l27j","id":"1(@)"} +{"time":"2025-12-07T11:24:29.069388046Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T11:24:29.069470322Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T11:24:29.06947842Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T11:24:29.070455077Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T11:24:29.071056569Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3903847-3904086-2511465429/socket","Net":"unix"}} +{"time":"2025-12-07T11:24:29.492068684Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T11:24:29.492103205Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T11:24:29.492114563Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug-internal.log b/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..83a9c3bb396ad5570504a9db6a0d8eac5dda8e2a --- /dev/null +++ b/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T11:23:12.630694921Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:23:12.907791181Z","level":"INFO","msg":"stream: created new stream","id":"pnd8l27j"} +{"time":"2025-12-07T11:23:12.907917345Z","level":"INFO","msg":"handler: started","stream_id":"pnd8l27j"} +{"time":"2025-12-07T11:23:12.908015373Z","level":"INFO","msg":"stream: started","id":"pnd8l27j"} +{"time":"2025-12-07T11:23:12.908025737Z","level":"INFO","msg":"writer: started","stream_id":"pnd8l27j"} +{"time":"2025-12-07T11:23:12.908038811Z","level":"INFO","msg":"sender: started","stream_id":"pnd8l27j"} +{"time":"2025-12-07T11:24:29.069489171Z","level":"INFO","msg":"stream: closing","id":"pnd8l27j"} +{"time":"2025-12-07T11:24:29.340717718Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T11:24:29.488868774Z","level":"INFO","msg":"handler: closed","stream_id":"pnd8l27j"} 
+{"time":"2025-12-07T11:24:29.488967515Z","level":"INFO","msg":"sender: closed","stream_id":"pnd8l27j"} +{"time":"2025-12-07T11:24:29.488977596Z","level":"INFO","msg":"stream: closed","id":"pnd8l27j"} diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug.log b/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5aca9188a265646620d9eb82aada7bb02bc4a204 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_setup.py:_flush():80] Configure stats pid to 3903847 +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug.log +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_112312-pnd8l27j/logs/debug-internal.log +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:23:12,372 INFO MainThread:3903847 [wandb_init.py:init():889] starting backend +2025-12-07 11:23:12,624 INFO MainThread:3903847 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:23:12,629 INFO MainThread:3903847 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:23:12,631 INFO MainThread:3903847 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:23:12,636 INFO MainThread:3903847 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:23:13,070 INFO MainThread:3903847 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:23:13,180 INFO MainThread:3903847 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:23:13,181 INFO MainThread:3903847 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:23:13,181 INFO MainThread:3903847 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:23:13,181 INFO MainThread:3903847 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 11:23:13,183 INFO MainThread:3903847 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:23:13,184 INFO MainThread:3903847 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 11:24:29,069 INFO wandb-AsyncioManager-main:3903847 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 11:24:29,069 INFO wandb-AsyncioManager-main:3903847 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_112312-pnd8l27j/run-pnd8l27j.wandb b/Meissonic/wandb/run-20251207_112312-pnd8l27j/run-pnd8l27j.wandb new file mode 100644 index 0000000000000000000000000000000000000000..c1db50bcd98b584f5008372b44da424a5800116e Binary files /dev/null and b/Meissonic/wandb/run-20251207_112312-pnd8l27j/run-pnd8l27j.wandb differ diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/files/config.yaml b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd713858dd7a9baa0639bb0e04bd9e4620614656 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/config.yaml @@ -0,0 +1,284 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + mln1dh42vy22ep22bd2q1xtyclc1do93: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "10" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11716177182720" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T11:28:23.253224Z" + writerId: mln1dh42vy22ep22bd2q1xtyclc1do93 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 10 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/files/output.log b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..dc5b09e80f923cc9a2ca960e18e28d277f95c0c5 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/output.log @@ -0,0 +1,128 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7047.56it/s] +12/07/2025 11:28:31 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 11:28:31 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 11:28:31 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:28:31 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:28:47 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:28:47 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:28:48 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 11:28:50 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 11:28:50 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 11:28:51 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:28:58 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:28:58 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:28:58 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:29:02 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:29:02 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:29:02 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 11:29:02 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/07/2025 11:29:02 - INFO - __main__ - Gradient Accumulation steps = 1 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1137, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1018, in forward + out_list = torch.utils.checkpoint.checkpoint( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1013, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + 
return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 734, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 458, in forward + assert e.dtype == torch.float32 +AssertionError +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1137, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward +[rank0]: return model_forward(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return 
forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1018, in forward +[rank0]: out_list = torch.utils.checkpoint.checkpoint( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint +[rank0]: ret = function(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1013, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 734, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 458, in forward +[rank0]: assert e.dtype == torch.float32 +[rank0]: AssertionError diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/files/requirements.txt b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 
+easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..20da3c45275a768a8a6eda8e41c78501a1a79d1e --- /dev/null +++ b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:28:23.253224Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + 
"cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716177182720" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "mln1dh42vy22ep22bd2q1xtyclc1do93" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/files/wandb-summary.json b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3e7cda165893749046f710cc0e2b34ef7d7a2597 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112823-lzb38axz/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":42},"_runtime":42} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug-core.log b/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..b9ffe9fc357591d8dfc9170d637d6bb8add1c426 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T11:28:23.351595655Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpzos2pua0/port-3911195.txt","pid":3911195,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:28:23.352156693Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3911195} +{"time":"2025-12-07T11:28:23.352171651Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3911195-3911439-744402320/socket","Net":"unix"}} +{"time":"2025-12-07T11:28:23.536406774Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T11:28:23.54341069Z","level":"INFO","msg":"handleInformInit: received","streamId":"lzb38axz","id":"1(@)"} +{"time":"2025-12-07T11:28:23.712656442Z","level":"INFO","msg":"handleInformInit: stream 
started","streamId":"lzb38axz","id":"1(@)"} +{"time":"2025-12-07T11:29:06.294246569Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T11:29:06.294305649Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T11:29:06.294294702Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T11:29:06.29441217Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3911195-3911439-744402320/socket","Net":"unix"}} +{"time":"2025-12-07T11:29:06.294456488Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T11:29:06.706090277Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T11:29:06.706118106Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T11:29:06.706130972Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug-internal.log b/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ffc993a5ba2a87e03e1c3ae875066e4601a4e69a --- /dev/null +++ b/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T11:28:23.544830146Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:28:23.712405173Z","level":"INFO","msg":"stream: created new stream","id":"lzb38axz"} +{"time":"2025-12-07T11:28:23.712512319Z","level":"INFO","msg":"handler: started","stream_id":"lzb38axz"} +{"time":"2025-12-07T11:28:23.71264909Z","level":"INFO","msg":"stream: started","id":"lzb38axz"} +{"time":"2025-12-07T11:28:23.712670096Z","level":"INFO","msg":"writer: started","stream_id":"lzb38axz"} +{"time":"2025-12-07T11:28:23.712671145Z","level":"INFO","msg":"sender: started","stream_id":"lzb38axz"} +{"time":"2025-12-07T11:29:06.29431695Z","level":"INFO","msg":"stream: closing","id":"lzb38axz"} +{"time":"2025-12-07T11:29:06.577747712Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T11:29:06.702786167Z","level":"INFO","msg":"handler: closed","stream_id":"lzb38axz"} +{"time":"2025-12-07T11:29:06.703004219Z","level":"INFO","msg":"sender: closed","stream_id":"lzb38axz"} +{"time":"2025-12-07T11:29:06.703021598Z","level":"INFO","msg":"stream: closed","id":"lzb38axz"} diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug.log b/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..229737b9ad85a490e7b9fb1c1d45ac25fa433e44 --- /dev/null +++ b/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 11:28:23,256 INFO MainThread:3911195 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:28:23,265 INFO MainThread:3911195 [wandb_setup.py:_flush():80] Configure stats pid to 3911195 +2025-12-07 11:28:23,265 INFO MainThread:3911195 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:28:23,265 INFO MainThread:3911195 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 11:28:23,265 INFO MainThread:3911195 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 11:28:23,266 INFO MainThread:3911195 [wandb_init.py:setup_run_log_directory():714] 
Logging user logs to /mnt/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug.log +2025-12-07 11:28:23,266 INFO MainThread:3911195 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_112823-lzb38axz/logs/debug-internal.log +2025-12-07 11:28:23,266 INFO MainThread:3911195 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:28:23,266 INFO MainThread:3911195 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:28:23,266 INFO MainThread:3911195 [wandb_init.py:init():889] starting backend +2025-12-07 11:28:23,536 INFO MainThread:3911195 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:28:23,541 INFO MainThread:3911195 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:28:23,547 INFO MainThread:3911195 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:28:23,553 INFO MainThread:3911195 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:28:23,926 INFO MainThread:3911195 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:28:24,037 INFO MainThread:3911195 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:28:24,037 INFO MainThread:3911195 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:28:24,037 INFO MainThread:3911195 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:28:24,037 INFO MainThread:3911195 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 11:28:24,040 INFO MainThread:3911195 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:28:24,041 INFO MainThread:3911195 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 11:29:06,294 INFO 
wandb-AsyncioManager-main:3911195 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 11:29:06,294 INFO wandb-AsyncioManager-main:3911195 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251207_112823-lzb38axz/run-lzb38axz.wandb b/Meissonic/wandb/run-20251207_112823-lzb38axz/run-lzb38axz.wandb new file mode 100644 index 0000000000000000000000000000000000000000..750d603a2102133ccbf177f2e52dfadc74ee4c71 Binary files /dev/null and b/Meissonic/wandb/run-20251207_112823-lzb38axz/run-lzb38axz.wandb differ diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/config.yaml b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3563457c517f19c45b710dae2af03e7ee173ef78 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/config.yaml @@ -0,0 +1,286 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + c9n47bvs3ik4sgzaydapw3o4clzdv1u6: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "480" + - --video_width + - "848" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "10" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11716177289216" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: 
"42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T11:31:03.896671Z" + writerId: c9n47bvs3ik4sgzaydapw3o4clzdv1u6 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 10 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/output.log b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..3754ec27c94845336c224e2c5782db12b63b4a48 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/output.log @@ -0,0 +1,129 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7726.35it/s] +12/07/2025 11:31:11 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 11:31:11 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 11:31:11 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:31:12 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:31:27 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:31:27 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:31:27 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 11:31:29 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 11:31:29 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 11:31:30 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:31:37 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:31:37 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:31:37 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:31:40 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:31:40 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:31:40 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 11:31:40 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 11:31:40 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 11:32:09 - INFO - __main__ - Step: 10 Loss: 11.1045 LR: 0.000300 +12/07/2025 11:32:09 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:32:09 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.04it/s] +12/07/2025 11:32:17 - ERROR - __main__ - Video validation failed: Got unsupported ScalarType BFloat16 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1244, in main + videos = pipe( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/src/pipeline_video.py", line 801, in __call__ + frame = videos[b, :, f, :, :].cpu().numpy() # [C, H, W] +TypeError: Got unsupported ScalarType BFloat16 +12/07/2025 11:32:40 - INFO - __main__ - Step: 20 Loss: 11.1045 LR: 0.000300 +12/07/2025 11:32:40 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:32:40 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.15it/s] +12/07/2025 11:32:49 - ERROR - __main__ - Video validation failed: Got unsupported ScalarType BFloat16 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1244, in main + videos = pipe( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/src/pipeline_video.py", line 801, in __call__ + frame = videos[b, :, f, :, :].cpu().numpy() # [C, H, W] +TypeError: Got unsupported ScalarType BFloat16 +12/07/2025 11:33:12 - INFO - __main__ - Step: 30 Loss: 11.1155 LR: 0.000300 +12/07/2025 11:33:12 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:33:12 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 6.92it/s] +12/07/2025 11:33:19 - ERROR - __main__ - Video validation failed: Got unsupported ScalarType BFloat16 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1244, in main + videos = pipe( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/src/pipeline_video.py", line 801, in __call__ + frame = videos[b, :, f, :, :].cpu().numpy() # [C, H, W] +TypeError: Got unsupported ScalarType BFloat16 +12/07/2025 11:33:41 - INFO - __main__ - Step: 40 Loss: 11.1310 LR: 0.000300 +12/07/2025 11:33:41 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:33:41 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.19it/s] +12/07/2025 11:33:48 - ERROR - __main__ - Video validation failed: Got unsupported ScalarType BFloat16 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1244, in main + videos = pipe( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/src/pipeline_video.py", line 801, in __call__ + # Convert to float32 before converting to numpy (numpy doesn't support bfloat16) +TypeError: Got unsupported ScalarType BFloat16 +12/07/2025 11:34:09 - INFO - __main__ - Step: 50 Loss: 11.0990 LR: 0.000300 +12/07/2025 11:34:09 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:34:09 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.15it/s] +12/07/2025 11:34:16 - ERROR - __main__ - Video validation failed: Got unsupported ScalarType BFloat16 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1244, in main + videos = pipe( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/src/pipeline_video.py", line 801, in __call__ + # Convert to float32 before converting to numpy (numpy doesn't support bfloat16) +TypeError: Got unsupported ScalarType BFloat16 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1178, in main + optimizer.step() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/optimizer.py", line 179, in step + self.optimizer.step(closure) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 133, in wrapper + return func.__get__(opt, opt.__class__)(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 517, in wrapper + out = func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 291, in step + self.update_step(group, p, gindex, pindex) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 504, in update_step + config = self.get_config(gindex, pindex, group) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 306, in get_config + config["alpha"] = group.get("alpha", 0.0) +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1178, in main +[rank0]: optimizer.step() +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/optimizer.py", line 179, in step +[rank0]: self.optimizer.step(closure) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 133, in wrapper +[rank0]: return func.__get__(opt, opt.__class__)(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 517, in wrapper +[rank0]: out = func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 291, in step +[rank0]: self.update_step(group, p, 
gindex, pindex) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 504, in update_step +[rank0]: config = self.get_config(gindex, pindex, group) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 306, in get_config +[rank0]: config["alpha"] = group.get("alpha", 0.0) +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/requirements.txt b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 
+nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7d7b892364813c84b4df3b922e18ccfa0cfdf806 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:31:03.896671Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716177289216" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + 
"uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "c9n47bvs3ik4sgzaydapw3o4clzdv1u6" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/wandb-summary.json b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b882743a8cb4333267b814035a29fcaff62a4b --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/files/wandb-summary.json @@ -0,0 +1 @@ +{"avg_masking_rate":0.9703603982925415,"_timestamp":1.7651072494390094e+09,"_step":50,"_wandb":{"runtime":196},"_runtime":196.040575818,"step_loss":11.099040031433105,"lr":0.0003} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug-core.log b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..30a5653e75d58f0ece71efb6b163cb8838949b9d --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug-core.log @@ -0,0 +1,12 @@ +{"time":"2025-12-07T11:31:03.966047354Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmphp5lvq3q/port-3914409.txt","pid":3914409,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:31:03.966549015Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3914409} +{"time":"2025-12-07T11:31:03.966537063Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3914409-3914707-2981209543/socket","Net":"unix"}} +{"time":"2025-12-07T11:31:04.152181292Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T11:31:04.158343933Z","level":"INFO","msg":"handleInformInit: received","streamId":"ijl2gw6b","id":"1(@)"} +{"time":"2025-12-07T11:31:04.328788451Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ijl2gw6b","id":"1(@)"} +{"time":"2025-12-07T11:34:20.623338472Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T11:34:20.623387242Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T11:34:20.623399867Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T11:34:20.623438208Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T11:34:20.623494197Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3914409-3914707-2981209543/socket","Net":"unix"}} +{"time":"2025-12-07T11:34:20.973896027Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug-internal.log b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8da2000256d2a9b04cc184c14ea0df5587aec8a0 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug-internal.log @@ -0,0 +1,8 @@ 
+{"time":"2025-12-07T11:31:04.158494463Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:31:04.328521993Z","level":"INFO","msg":"stream: created new stream","id":"ijl2gw6b"} +{"time":"2025-12-07T11:31:04.328634567Z","level":"INFO","msg":"handler: started","stream_id":"ijl2gw6b"} +{"time":"2025-12-07T11:31:04.328779529Z","level":"INFO","msg":"stream: started","id":"ijl2gw6b"} +{"time":"2025-12-07T11:31:04.32878983Z","level":"INFO","msg":"writer: started","stream_id":"ijl2gw6b"} +{"time":"2025-12-07T11:31:04.328809887Z","level":"INFO","msg":"sender: started","stream_id":"ijl2gw6b"} +{"time":"2025-12-07T11:34:20.623401927Z","level":"INFO","msg":"stream: closing","id":"ijl2gw6b"} +{"time":"2025-12-07T11:34:20.869662354Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug.log b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..83f5251a18c2900fce35def1dd13edcf8b36ee97 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_setup.py:_flush():80] Configure stats pid to 3914409 +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug.log +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_113103-ijl2gw6b/logs/debug-internal.log +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:31:03,900 INFO MainThread:3914409 [wandb_init.py:init():889] starting backend +2025-12-07 11:31:04,152 INFO MainThread:3914409 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:31:04,156 INFO MainThread:3914409 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:31:04,157 INFO MainThread:3914409 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:31:04,162 INFO MainThread:3914409 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:31:04,582 INFO MainThread:3914409 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:31:04,691 INFO MainThread:3914409 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:31:04,691 INFO MainThread:3914409 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:31:04,691 INFO MainThread:3914409 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:31:04,691 INFO MainThread:3914409 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 11:31:04,693 INFO MainThread:3914409 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:31:04,694 INFO MainThread:3914409 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 11:34:20,623 INFO wandb-AsyncioManager-main:3914409 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 11:34:20,623 INFO wandb-AsyncioManager-main:3914409 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_113103-ijl2gw6b/run-ijl2gw6b.wandb b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/run-ijl2gw6b.wandb new file mode 100644 index 0000000000000000000000000000000000000000..eda915f9b3920148edebebfef87e40c74b69bd19 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113103-ijl2gw6b/run-ijl2gw6b.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75ac8df50a55ca11df3a76cf855f21b49001e571c10c270f7480730cacd7d456 +size 163840 diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_2f39bee6c4969d94f6d2.png b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_2f39bee6c4969d94f6d2.png new file mode 100644 index 0000000000000000000000000000000000000000..2ef2f2aaf539effa7771a87f85c70c4ad2d9994e --- /dev/null +++ b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_2f39bee6c4969d94f6d2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f39bee6c4969d94f6d2f21606e7b9e9db5d6b1523ab923157c4f100d2efb5f8 +size 346775 diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_a0ddb52b457bceac4774.png b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_a0ddb52b457bceac4774.png new file mode 100644 index 0000000000000000000000000000000000000000..44cc69e3b8681fcfd5aa3f6aa6e6020d8d098a01 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_a0ddb52b457bceac4774.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0ddb52b457bceac477459b3b363ef6ca755a4f9a7dfc5cb46230ed7bebdb902 +size 319513 diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/files/output.log b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5610fa86236757a62c15f3bea15618f54c3c4c49 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/output.log @@ -0,0 +1,25 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5515.71it/s] +12/07/2025 11:36:14 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=30, W'=53 +12/07/2025 11:36:14 - INFO - __main__ - Theoretical dimensions: F'=2, H'=30, W'=53 +12/07/2025 11:36:14 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:36:15 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:36:30 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:36:30 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:36:30 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 11:36:32 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 11:36:32 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 11:36:33 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:36:40 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:36:40 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:36:40 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:36:45 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:36:45 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:36:45 - INFO - __main__ - Instantaneous batch size per device = 1 +12/07/2025 11:36:45 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/07/2025 11:36:45 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 11:37:15 - INFO - __main__ - Step: 10 Loss: 11.1020 LR: 0.000300 +12/07/2025 11:37:15 - INFO - __main__ - Generating videos for validation... +12/07/2025 11:37:15 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.16it/s] +12/07/2025 11:37:24 - INFO - __main__ - Validation videos saved to ./output diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/files/requirements.txt b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/wandb-metadata.json new file mode 100644 index 
0000000000000000000000000000000000000000..28ba766bcf83f0eacac7e7ed8c7c70b25bc55138 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113607-aryc95f2/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:36:07.172130Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "480", + "--video_width", + "848", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "10", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716177752064" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": 
"GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "i7s1dw0ikzbn04iq196avj71dbszartp" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug-core.log b/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..fb96d6f78ad155abc35327ce8738bfc2df21433b --- /dev/null +++ b/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-07T11:36:07.239292027Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpavlp0nyg/port-3931633.txt","pid":3931633,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:36:07.239846512Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3931633} +{"time":"2025-12-07T11:36:07.239820016Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3931633-3931878-4082379513/socket","Net":"unix"}} +{"time":"2025-12-07T11:36:07.425321124Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T11:36:07.431425061Z","level":"INFO","msg":"handleInformInit: received","streamId":"aryc95f2","id":"1(@)"} +{"time":"2025-12-07T11:36:07.598167522Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"aryc95f2","id":"1(@)"} +{"time":"2025-12-07T11:37:50.473962445Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug-internal.log b/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3d4311c697f21dd8813cd2d3bee00b96de055b3a --- /dev/null +++ b/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-07T11:36:07.431545441Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:36:07.597933784Z","level":"INFO","msg":"stream: created new stream","id":"aryc95f2"} +{"time":"2025-12-07T11:36:07.598047441Z","level":"INFO","msg":"handler: started","stream_id":"aryc95f2"} +{"time":"2025-12-07T11:36:07.598159983Z","level":"INFO","msg":"stream: started","id":"aryc95f2"} +{"time":"2025-12-07T11:36:07.598179248Z","level":"INFO","msg":"sender: started","stream_id":"aryc95f2"} +{"time":"2025-12-07T11:36:07.598178773Z","level":"INFO","msg":"writer: started","stream_id":"aryc95f2"} diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug.log b/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..370be6305aec5185a5d28503fcc5d04574eb3201 --- /dev/null +++ b/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_setup.py:_flush():80] Configure stats pid to 3931633 +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_setup.py:_flush():80] Loading settings from 
environment variables +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug.log +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_113607-aryc95f2/logs/debug-internal.log +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:36:07,175 INFO MainThread:3931633 [wandb_init.py:init():889] starting backend +2025-12-07 11:36:07,425 INFO MainThread:3931633 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:36:07,429 INFO MainThread:3931633 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:36:07,431 INFO MainThread:3931633 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:36:07,435 INFO MainThread:3931633 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:36:07,853 INFO MainThread:3931633 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:36:07,963 INFO MainThread:3931633 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:36:07,963 INFO MainThread:3931633 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:36:07,963 INFO MainThread:3931633 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:36:07,963 INFO MainThread:3931633 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 11:36:07,966 INFO MainThread:3931633 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:36:07,966 INFO MainThread:3931633 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 10, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 
'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251207_113607-aryc95f2/run-aryc95f2.wandb b/Meissonic/wandb/run-20251207_113607-aryc95f2/run-aryc95f2.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a0ba00b1619a9e67e74e912aa00741ada5768f68 Binary files /dev/null and b/Meissonic/wandb/run-20251207_113607-aryc95f2/run-aryc95f2.wandb differ diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/files/config.yaml b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18d750f2358416f7608d16afb99fb2b51a8d78d9 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/config.yaml @@ -0,0 +1,284 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + t65k1ym7dgsibmh6uuu0ibee7l4cp1t9: + args: + - --text_encoder_architecture + - umt5-base + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "16" + - --video_height + - "256" + - --video_width + - "448" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a dog running + - --output_dir + - ./output + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "500" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11716183887872" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 
6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-07T11:40:18.229546Z" + writerId: t65k1ym7dgsibmh6uuu0ibee7l4cp1t9 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 0 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-base +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a dog running +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 448 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/files/output.log b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1925bb725e8249b0369eefc354de5682e6991b7c --- /dev/null +++ b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/output.log @@ -0,0 +1,60 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5361.60it/s] +12/07/2025 11:40:25 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=16, W'=28 +12/07/2025 11:40:25 - INFO - __main__ - Theoretical dimensions: F'=2, H'=16, W'=28 +12/07/2025 11:40:25 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:40:26 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:40:41 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:40:41 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:40:41 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 11:40:43 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 11:40:43 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 11:40:44 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:40:51 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:40:51 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:40:51 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:40:55 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:40:55 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:40:55 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 11:40:55 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 128 +12/07/2025 11:40:55 - INFO - __main__ - Gradient Accumulation steps = 4 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1047, in main + for batch in train_dataloader: + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 579, in __iter__ + next_batch = next(dataloader_iter) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ + data = self._next_data() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/mnt/Meissonic/train/dataset_utils.py", line 618, in __getitem__ + video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) + File "/mnt/Meissonic/train/dataset_utils.py", line 289, in process_video + if video_tensor.max() > 1.0: +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1325, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1047, in main +[rank0]: for batch in train_dataloader: +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/data_loader.py", line 579, in __iter__ +[rank0]: next_batch = next(dataloader_iter) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 732, in __next__ +[rank0]: data = self._next_data() +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 788, in _next_data +[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch +[rank0]: data = [self.dataset[idx] for idx in possibly_batched_index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in +[rank0]: data = [self.dataset[idx] for idx in possibly_batched_index] +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 618, in __getitem__ +[rank0]: video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width) +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 289, in process_video +[rank0]: if video_tensor.max() > 1.0: +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/files/requirements.txt b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 
+dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6225fecd7c27fd50feb59652b8315a501a81a001 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:40:18.229546Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "256", + "--video_width", + "448", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + 
"--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716183887872" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "t65k1ym7dgsibmh6uuu0ibee7l4cp1t9" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/files/wandb-summary.json b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ba79034c0852fc775623bd803c61ef0d4583f73b --- /dev/null +++ b/Meissonic/wandb/run-20251207_114018-unqfpii2/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":230},"_runtime":230} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug-core.log 
b/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2a52495e9352b00da17cffa2c00dd2bbf0bf0e85 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-07T11:40:18.32862155Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmphohl_s8n/port-3940283.txt","pid":3940283,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:40:18.329118059Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3940283} +{"time":"2025-12-07T11:40:18.329128653Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3940283-3940519-1913849857/socket","Net":"unix"}} +{"time":"2025-12-07T11:40:18.51515006Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T11:40:18.522683298Z","level":"INFO","msg":"handleInformInit: received","streamId":"unqfpii2","id":"1(@)"} +{"time":"2025-12-07T11:40:18.693810848Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"unqfpii2","id":"1(@)"} +{"time":"2025-12-07T11:44:09.254701985Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-07T11:44:09.254887524Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-07T11:44:09.254902655Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-07T11:44:09.254934738Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-07T11:44:09.255035488Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3940283-3940519-1913849857/socket","Net":"unix"}} +{"time":"2025-12-07T11:44:09.666968034Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-07T11:44:09.667004278Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-07T11:44:09.667018576Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug-internal.log b/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..702cb957612bcc7e744acd77e775abe0c5afaf19 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-07T11:40:18.522795663Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:40:18.69358065Z","level":"INFO","msg":"stream: created new stream","id":"unqfpii2"} +{"time":"2025-12-07T11:40:18.693668831Z","level":"INFO","msg":"handler: started","stream_id":"unqfpii2"} +{"time":"2025-12-07T11:40:18.693801158Z","level":"INFO","msg":"stream: started","id":"unqfpii2"} +{"time":"2025-12-07T11:40:18.693816118Z","level":"INFO","msg":"sender: started","stream_id":"unqfpii2"} +{"time":"2025-12-07T11:40:18.693823281Z","level":"INFO","msg":"writer: started","stream_id":"unqfpii2"} +{"time":"2025-12-07T11:44:09.254902953Z","level":"INFO","msg":"stream: closing","id":"unqfpii2"} +{"time":"2025-12-07T11:44:09.501197596Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-07T11:44:09.663485633Z","level":"INFO","msg":"handler: closed","stream_id":"unqfpii2"} 
+{"time":"2025-12-07T11:44:09.663812704Z","level":"INFO","msg":"sender: closed","stream_id":"unqfpii2"} +{"time":"2025-12-07T11:44:09.663835315Z","level":"INFO","msg":"stream: closed","id":"unqfpii2"} diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug.log b/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..06518f508f54b419d48dca754043f01d9ebaa089 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-07 11:40:18,234 INFO MainThread:3940283 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:40:18,234 INFO MainThread:3940283 [wandb_setup.py:_flush():80] Configure stats pid to 3940283 +2025-12-07 11:40:18,234 INFO MainThread:3940283 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:40:18,234 INFO MainThread:3940283 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 11:40:18,234 INFO MainThread:3940283 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 11:40:18,234 INFO MainThread:3940283 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug.log +2025-12-07 11:40:18,234 INFO MainThread:3940283 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_114018-unqfpii2/logs/debug-internal.log +2025-12-07 11:40:18,235 INFO MainThread:3940283 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:40:18,235 INFO MainThread:3940283 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:40:18,235 INFO MainThread:3940283 [wandb_init.py:init():889] starting backend +2025-12-07 11:40:18,515 INFO MainThread:3940283 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:40:18,521 INFO MainThread:3940283 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:40:18,523 INFO MainThread:3940283 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:40:18,529 INFO MainThread:3940283 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:40:18,934 INFO MainThread:3940283 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:40:19,109 INFO MainThread:3940283 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:40:19,109 INFO MainThread:3940283 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:40:19,109 INFO MainThread:3940283 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:40:19,109 INFO MainThread:3940283 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 11:40:19,112 INFO MainThread:3940283 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:40:19,113 INFO MainThread:3940283 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 0, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-07 11:44:09,254 INFO wandb-AsyncioManager-main:3940283 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-07 11:44:09,255 INFO wandb-AsyncioManager-main:3940283 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251207_114018-unqfpii2/run-unqfpii2.wandb b/Meissonic/wandb/run-20251207_114018-unqfpii2/run-unqfpii2.wandb new file mode 100644 index 0000000000000000000000000000000000000000..9df9ef5e9d78b644c3340e65a4dc05bb1be43494 Binary files /dev/null and b/Meissonic/wandb/run-20251207_114018-unqfpii2/run-unqfpii2.wandb differ diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_1fc345a8cdc18e62468b.png b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_1fc345a8cdc18e62468b.png new file mode 100644 index 0000000000000000000000000000000000000000..fcaecfebdf5f76d6b6328047baecb7e9123573a4 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_1fc345a8cdc18e62468b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc345a8cdc18e62468bc94e0040d6d68e8f5cee84f31e8b5a76809fc6038489 +size 143135 diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_f4b36308698e96e11163.png b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_f4b36308698e96e11163.png new file mode 100644 index 0000000000000000000000000000000000000000..a8018b05c8d8d9b277c303768977fcc2f783a8bb --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_f4b36308698e96e11163.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4b36308698e96e111639ab60f72663a36f846def12fbea1cb0b0e05ae5746d0 +size 146021 diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_0798147230daa742b054.png b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_0798147230daa742b054.png new file mode 100644 index 0000000000000000000000000000000000000000..437885e3de202983115651ab01dc8451e5ed3010 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_0798147230daa742b054.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0798147230daa742b0543d2de20dd80f7d8d6a88156498ab0b5bd665af699819 +size 142782 diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_aed08910c4a8dcdc87f6.png b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_aed08910c4a8dcdc87f6.png new file mode 100644 index 0000000000000000000000000000000000000000..116c2a4672bf425bcdb1f0a2fc7cd1d6d8424ebd --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_aed08910c4a8dcdc87f6.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed08910c4a8dcdc87f630c3b3580dd2b1216c9fa6a8623685709e8a1b164434 +size 136697 diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/output.log b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..da387c826ac353cfef600d2f0c74874d78b0e4c2 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/output.log @@ -0,0 +1,142 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. 
If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6782.20it/s] +12/07/2025 11:44:34 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=16, W'=28 +12/07/2025 11:44:34 - INFO - __main__ - Theoretical dimensions: F'=2, H'=16, W'=28 +12/07/2025 11:44:34 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:44:34 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 11:44:50 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:44:50 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 11:44:50 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 11:44:52 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 11:44:52 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 11:44:53 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 11:45:01 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 11:45:01 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 11:45:01 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 11:45:04 - INFO - __main__ - ***** Running training ***** +12/07/2025 11:45:04 - INFO - __main__ - Num training steps = 10000 +12/07/2025 11:45:04 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 11:45:04 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 128 +12/07/2025 11:45:04 - INFO - __main__ - Gradient Accumulation steps = 4 +12/07/2025 11:46:27 - INFO - __main__ - Step: 10 Loss: 11.1013 LR: 0.000300 +12/07/2025 11:47:27 - INFO - __main__ - Step: 20 Loss: 11.0968 LR: 0.000300 +12/07/2025 11:48:32 - INFO - __main__ - Step: 30 Loss: 11.0978 LR: 0.000300 +12/07/2025 11:49:50 - INFO - __main__ - Step: 40 Loss: 11.0959 LR: 0.000300 +12/07/2025 11:51:05 - INFO - __main__ - Step: 50 Loss: 11.0931 LR: 0.000300 +12/07/2025 11:52:15 - INFO - __main__ - Step: 60 Loss: 11.0892 LR: 0.000300 +12/07/2025 11:53:30 - INFO - __main__ - Step: 70 Loss: 11.0862 LR: 0.000300 +12/07/2025 11:54:40 - INFO - __main__ - Step: 80 Loss: 11.0748 LR: 0.000300 +12/07/2025 11:55:57 - INFO - __main__ - Step: 90 Loss: 11.0534 LR: 0.000300 +12/07/2025 11:57:10 - INFO - __main__ - Step: 100 Loss: 11.0217 LR: 0.000300 +12/07/2025 11:58:21 - INFO - __main__ - Step: 110 Loss: 10.9794 LR: 0.000300 +12/07/2025 11:59:42 - INFO - __main__ - Step: 120 Loss: 10.9246 LR: 0.000300 +12/07/2025 12:00:53 - INFO - __main__ - Step: 130 Loss: 10.8665 LR: 0.000300 +12/07/2025 12:02:06 - INFO - __main__ - Step: 140 Loss: 10.8102 LR: 0.000300 +12/07/2025 12:03:26 - INFO - __main__ - Step: 150 Loss: 10.7432 LR: 0.000300 +12/07/2025 12:04:35 - INFO - __main__ - Step: 160 Loss: 10.7224 LR: 0.000300 +12/07/2025 12:05:50 - INFO - __main__ - Step: 170 Loss: 10.6470 LR: 0.000300 +12/07/2025 12:07:03 - INFO - __main__ - Step: 180 Loss: 10.6287 LR: 0.000300 +12/07/2025 12:08:09 - INFO - __main__ - Step: 190 Loss: 10.5610 LR: 0.000300 +12/07/2025 12:09:22 - INFO - __main__ - Step: 200 Loss: 10.5024 LR: 0.000300 +12/07/2025 12:10:45 - INFO - __main__ - Step: 210 Loss: 10.4316 LR: 0.000300 +12/07/2025 12:12:00 - INFO - __main__ - Step: 220 Loss: 10.5158 LR: 0.000300 +12/07/2025 12:13:13 - INFO - __main__ - Step: 230 Loss: 10.3492 LR: 0.000300 +12/07/2025 12:14:33 - INFO - __main__ - Step: 240 Loss: 10.3635 LR: 0.000300 +12/07/2025 12:15:38 - INFO - __main__ - Step: 250 Loss: 10.3454 LR: 0.000300 +12/07/2025 12:16:54 - INFO - __main__ - Step: 260 Loss: 10.3692 LR: 0.000300 +12/07/2025 12:18:03 - INFO - __main__ - Step: 270 Loss: 10.1443 LR: 0.000300 +12/07/2025 12:19:22 - INFO - __main__ - Step: 280 Loss: 10.2463 LR: 0.000300 +12/07/2025 12:20:38 - INFO - __main__ - Step: 290 Loss: 10.2848 LR: 0.000300 +12/07/2025 12:21:40 - INFO - __main__ - Step: 300 Loss: 10.2996 LR: 0.000300 +12/07/2025 12:23:05 - INFO - __main__ - Step: 310 Loss: 10.2494 LR: 0.000300 +12/07/2025 12:24:14 - INFO - __main__ - Step: 320 Loss: 10.2331 LR: 0.000300 +12/07/2025 12:25:24 - INFO - __main__ - Step: 330 Loss: 10.2331 LR: 0.000300 +12/07/2025 12:26:37 - INFO - __main__ - Step: 340 Loss: 10.2081 LR: 0.000300 +12/07/2025 12:27:48 - INFO - __main__ - Step: 350 Loss: 10.2847 LR: 0.000300 +12/07/2025 12:29:05 - INFO - __main__ - Step: 360 Loss: 10.3289 LR: 0.000300 +12/07/2025 12:30:20 - INFO - __main__ - Step: 370 Loss: 10.2402 LR: 0.000300 +12/07/2025 12:31:29 - INFO - __main__ - Step: 380 Loss: 10.2864 LR: 0.000300 +12/07/2025 12:32:38 - INFO - __main__ - Step: 390 Loss: 10.2875 LR: 0.000300 +12/07/2025 12:33:56 - INFO - __main__ - Step: 400 Loss: 10.3194 LR: 0.000300 +12/07/2025 12:35:17 - INFO - __main__ - Step: 410 Loss: 10.3388 LR: 0.000300 +12/07/2025 12:36:16 - INFO - __main__ - Step: 420 Loss: 10.3163 LR: 0.000300 +12/07/2025 12:37:39 - INFO - __main__ - Step: 430 Loss: 10.2540 LR: 0.000300 +12/07/2025 12:38:46 - INFO - __main__ - Step: 440 Loss: 10.2222 LR: 0.000300 +12/07/2025 
12:39:58 - INFO - __main__ - Step: 450 Loss: 10.1941 LR: 0.000300 +12/07/2025 12:41:19 - INFO - __main__ - Step: 460 Loss: 10.2500 LR: 0.000300 +12/07/2025 12:42:38 - INFO - __main__ - Step: 470 Loss: 10.2962 LR: 0.000300 +12/07/2025 12:43:46 - INFO - __main__ - Step: 480 Loss: 10.1556 LR: 0.000300 +12/07/2025 12:44:59 - INFO - __main__ - Step: 490 Loss: 10.2519 LR: 0.000300 +12/07/2025 12:46:21 - INFO - __main__ - Step: 500 Loss: 10.2185 LR: 0.000300 +12/07/2025 12:46:21 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-500 +12/07/2025 12:46:32 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-500/optimizer.bin +12/07/2025 12:46:32 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-500/scheduler.bin +12/07/2025 12:46:32 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-500/sampler.bin +12/07/2025 12:46:32 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-500/random_states_0.pkl +12/07/2025 12:46:32 - INFO - __main__ - Saved state to output/checkpoint-500 +12/07/2025 12:46:32 - INFO - __main__ - Generating videos for validation... +12/07/2025 12:46:32 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.76it/s] +12/07/2025 12:46:40 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 12:47:43 - INFO - __main__ - Step: 510 Loss: 10.2208 LR: 0.000300 +12/07/2025 12:49:05 - INFO - __main__ - Step: 520 Loss: 10.2090 LR: 0.000300 +12/07/2025 12:50:25 - INFO - __main__ - Step: 530 Loss: 10.1335 LR: 0.000300 +12/07/2025 12:51:40 - INFO - __main__ - Step: 540 Loss: 10.2288 LR: 0.000300 +12/07/2025 12:52:54 - INFO - __main__ - Step: 550 Loss: 10.2295 LR: 0.000300 +12/07/2025 12:54:25 - INFO - __main__ - Step: 560 Loss: 10.2106 LR: 0.000300 +12/07/2025 12:55:41 - INFO - __main__ - Step: 570 Loss: 10.2903 LR: 0.000300 +12/07/2025 12:56:48 - INFO - __main__ - Step: 580 Loss: 10.1587 LR: 0.000300 +12/07/2025 12:58:13 - INFO - __main__ - Step: 590 Loss: 10.2297 LR: 0.000300 +12/07/2025 12:59:17 - INFO - __main__ - Step: 600 Loss: 10.2574 LR: 0.000300 +12/07/2025 13:00:33 - INFO - __main__ - Step: 610 Loss: 10.2930 LR: 0.000300 +12/07/2025 13:01:40 - INFO - __main__ - Step: 620 Loss: 10.1698 LR: 0.000300 +12/07/2025 13:03:00 - INFO - __main__ - Step: 630 Loss: 10.2934 LR: 0.000300 +12/07/2025 13:04:17 - INFO - __main__ - Step: 640 Loss: 10.1840 LR: 0.000300 +12/07/2025 13:05:34 - INFO - __main__ - Step: 650 Loss: 10.2603 LR: 0.000300 +12/07/2025 13:06:43 - INFO - __main__ - Step: 660 Loss: 10.1738 LR: 0.000300 +12/07/2025 13:07:57 - INFO - __main__ - Step: 670 Loss: 10.2186 LR: 0.000300 +12/07/2025 13:09:03 - INFO - __main__ - Step: 680 Loss: 10.2047 LR: 0.000300 +12/07/2025 13:10:12 - INFO - __main__ - Step: 690 Loss: 10.1988 LR: 0.000300 +12/07/2025 13:11:33 - INFO - __main__ - Step: 700 Loss: 10.2196 LR: 0.000300 +12/07/2025 13:12:48 - INFO - __main__ - Step: 710 Loss: 10.1927 LR: 0.000300 +12/07/2025 13:13:59 - INFO - __main__ - Step: 720 Loss: 10.3047 LR: 0.000300 +12/07/2025 13:15:20 - INFO - __main__ - Step: 730 Loss: 10.2294 LR: 0.000300 +12/07/2025 13:16:25 - INFO - __main__ - Step: 740 Loss: 10.2354 LR: 0.000300 +12/07/2025 13:17:39 - INFO - __main__ - Step: 750 Loss: 10.1999 LR: 0.000300 +12/07/2025 13:18:57 - INFO - __main__ - Step: 760 Loss: 
10.2463 LR: 0.000300 +12/07/2025 13:20:04 - INFO - __main__ - Step: 770 Loss: 10.1668 LR: 0.000300 +12/07/2025 13:21:11 - INFO - __main__ - Step: 780 Loss: 10.1604 LR: 0.000300 +12/07/2025 13:22:28 - INFO - __main__ - Step: 790 Loss: 10.1976 LR: 0.000300 +12/07/2025 13:23:43 - INFO - __main__ - Step: 800 Loss: 10.1867 LR: 0.000300 +12/07/2025 13:24:53 - INFO - __main__ - Step: 810 Loss: 10.2898 LR: 0.000300 +12/07/2025 13:26:02 - INFO - __main__ - Step: 820 Loss: 10.1820 LR: 0.000300 +12/07/2025 13:27:21 - INFO - __main__ - Step: 830 Loss: 10.2871 LR: 0.000300 +12/07/2025 13:28:40 - INFO - __main__ - Step: 840 Loss: 10.2063 LR: 0.000300 +12/07/2025 13:29:52 - INFO - __main__ - Step: 850 Loss: 10.0304 LR: 0.000300 +12/07/2025 13:31:04 - INFO - __main__ - Step: 860 Loss: 10.2020 LR: 0.000300 +12/07/2025 13:32:08 - INFO - __main__ - Step: 870 Loss: 10.2362 LR: 0.000300 +12/07/2025 13:33:21 - INFO - __main__ - Step: 880 Loss: 10.2228 LR: 0.000300 +12/07/2025 13:34:23 - INFO - __main__ - Step: 890 Loss: 10.1728 LR: 0.000300 +12/07/2025 13:35:38 - INFO - __main__ - Step: 900 Loss: 10.2501 LR: 0.000300 +12/07/2025 13:36:58 - INFO - __main__ - Step: 910 Loss: 10.2356 LR: 0.000300 +12/07/2025 13:38:17 - INFO - __main__ - Step: 920 Loss: 10.2477 LR: 0.000300 +12/07/2025 13:39:24 - INFO - __main__ - Step: 930 Loss: 10.3031 LR: 0.000300 +12/07/2025 13:40:43 - INFO - __main__ - Step: 940 Loss: 10.2631 LR: 0.000300 +12/07/2025 13:42:01 - INFO - __main__ - Step: 950 Loss: 10.1677 LR: 0.000300 +12/07/2025 13:43:09 - INFO - __main__ - Step: 960 Loss: 10.2624 LR: 0.000300 +12/07/2025 13:44:25 - INFO - __main__ - Step: 970 Loss: 10.2238 LR: 0.000300 +12/07/2025 13:45:39 - INFO - __main__ - Step: 980 Loss: 10.2911 LR: 0.000300 +12/07/2025 13:46:54 - INFO - __main__ - Step: 990 Loss: 10.2010 LR: 0.000300 +12/07/2025 13:48:02 - INFO - __main__ - Step: 1000 Loss: 10.1471 LR: 0.000300 +12/07/2025 13:48:02 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-1000 +12/07/2025 13:48:14 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-1000/optimizer.bin +12/07/2025 13:48:14 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-1000/scheduler.bin +12/07/2025 13:48:14 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-1000/sampler.bin +12/07/2025 13:48:14 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-1000/random_states_0.pkl +12/07/2025 13:48:14 - INFO - __main__ - Saved state to output/checkpoint-1000 +12/07/2025 13:48:14 - INFO - __main__ - Generating videos for validation... +12/07/2025 13:48:14 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.14it/s] +12/07/2025 13:48:23 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 13:49:34 - INFO - __main__ - Step: 1010 Loss: 10.1695 LR: 0.000300 +12/07/2025 13:50:41 - INFO - __main__ - Step: 1020 Loss: 10.2763 LR: 0.000300 diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/requirements.txt b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git 
a/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..717f7761d9e058303bd87ee0483ee540c13965e1 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T11:44:26.784203Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "16", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11716184072192" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": 
"Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "x3sywcr56viupxidfz51blnsu7ngelyi" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug-core.log b/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..3548b5105d7984edfe1b715dd02c2173144bbb13 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-07T11:44:26.852543752Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp8d6fvgww/port-3978300.txt","pid":3978300,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T11:44:26.853044467Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3978300} +{"time":"2025-12-07T11:44:26.85303971Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3978300-3978539-2452619541/socket","Net":"unix"}} +{"time":"2025-12-07T11:44:27.039184197Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T11:44:27.045429004Z","level":"INFO","msg":"handleInformInit: received","streamId":"5sh31nrg","id":"1(@)"} +{"time":"2025-12-07T11:44:27.219826556Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"5sh31nrg","id":"1(@)"} +{"time":"2025-12-07T13:51:13.313919726Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug-internal.log b/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..74c4e90de3cd30090c5bf5f578bf96cad6671b11 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-07T11:44:27.045593047Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T11:44:27.219601606Z","level":"INFO","msg":"stream: created new stream","id":"5sh31nrg"} +{"time":"2025-12-07T11:44:27.219700214Z","level":"INFO","msg":"handler: started","stream_id":"5sh31nrg"} +{"time":"2025-12-07T11:44:27.219819138Z","level":"INFO","msg":"stream: started","id":"5sh31nrg"} +{"time":"2025-12-07T11:44:27.219835504Z","level":"INFO","msg":"writer: started","stream_id":"5sh31nrg"} +{"time":"2025-12-07T11:44:27.219838032Z","level":"INFO","msg":"sender: started","stream_id":"5sh31nrg"} diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug.log b/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b7764ae5520b0f1b66f68c89ad68faaa8f474bf8 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_setup.py:_flush():80] Configure stats pid to 3978300 +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 11:44:26,787 INFO 
MainThread:3978300 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug.log +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_114426-5sh31nrg/logs/debug-internal.log +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_init.py:init():841] calling init triggers +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 11:44:26,787 INFO MainThread:3978300 [wandb_init.py:init():889] starting backend +2025-12-07 11:44:27,039 INFO MainThread:3978300 [wandb_init.py:init():892] sending inform_init request +2025-12-07 11:44:27,043 INFO MainThread:3978300 [wandb_init.py:init():900] backend started and connected +2025-12-07 11:44:27,045 INFO MainThread:3978300 [wandb_init.py:init():970] updated telemetry +2025-12-07 11:44:27,049 INFO MainThread:3978300 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 11:44:27,424 INFO MainThread:3978300 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 11:44:27,537 INFO MainThread:3978300 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 11:44:27,537 INFO MainThread:3978300 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 11:44:27,537 INFO MainThread:3978300 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 11:44:27,537 INFO MainThread:3978300 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 11:44:27,540 INFO MainThread:3978300 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 11:44:27,541 INFO MainThread:3978300 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251207_114426-5sh31nrg/run-5sh31nrg.wandb b/Meissonic/wandb/run-20251207_114426-5sh31nrg/run-5sh31nrg.wandb new file mode 100644 index 0000000000000000000000000000000000000000..03655dfe59ed05cfb933df386329fbab377b3d48 --- /dev/null +++ b/Meissonic/wandb/run-20251207_114426-5sh31nrg/run-5sh31nrg.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d7a63dd61906552ded340d36b96c81cf79d7c410a86007ebd655410dc23673c +size 1900544 diff --git a/Meissonic/wandb/run-20251207_140740-wongn7bj/files/output.log b/Meissonic/wandb/run-20251207_140740-wongn7bj/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5bc5ecd941739e6b255df72d9c57d37c624a916c --- /dev/null +++ b/Meissonic/wandb/run-20251207_140740-wongn7bj/files/output.log @@ -0,0 +1,25 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10173.29it/s] +12/07/2025 14:07:48 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=1, H'=16, W'=28 +12/07/2025 14:07:48 - INFO - __main__ - Theoretical dimensions: F'=0, H'=16, W'=28 +12/07/2025 14:07:48 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 14:07:49 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 14:08:04 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 14:08:04 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 14:08:04 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 14:08:06 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 14:08:06 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 14:08:07 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 14:08:15 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 14:08:15 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 14:08:15 - INFO - __main__ - Dataloader configuration: +12/07/2025 14:08:15 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/07/2025 14:08:15 - INFO - __main__ - - prefetch_factor: 2 +12/07/2025 14:08:15 - INFO - __main__ - - persistent_workers: True +12/07/2025 14:08:15 - INFO - __main__ - - pin_memory: True +12/07/2025 14:08:15 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 14:08:19 - INFO - __main__ - ***** Running training ***** +12/07/2025 14:08:19 - INFO - __main__ - Num training steps = 10000 +12/07/2025 14:08:19 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 14:08:19 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 128 +12/07/2025 14:08:19 - INFO - __main__ - Gradient Accumulation steps = 4 diff --git a/Meissonic/wandb/run-20251207_140740-wongn7bj/files/requirements.txt b/Meissonic/wandb/run-20251207_140740-wongn7bj/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_140740-wongn7bj/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_140740-wongn7bj/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_140740-wongn7bj/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e4a6e009ecbeea5065b8b46fe97d49f4b8ae3fd8 --- /dev/null +++ 
b/Meissonic/wandb/run-20251207_140740-wongn7bj/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T14:07:40.971224Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "1", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a dog running", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "500", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11727619481600" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + 
"writerId": "91kfqdulncvcwtj6blirwxrit7ups2mx" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug-core.log b/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..45030ce887f91fd528784c9e0373324cc6301b95 --- /dev/null +++ b/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-07T14:07:41.131286203Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpd38m19kj/port-4138782.txt","pid":4138782,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T14:07:41.131752706Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":4138782} +{"time":"2025-12-07T14:07:41.131732822Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-4138782-4139047-1035025119/socket","Net":"unix"}} +{"time":"2025-12-07T14:07:41.298684753Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T14:07:41.309453234Z","level":"INFO","msg":"handleInformInit: received","streamId":"wongn7bj","id":"1(@)"} +{"time":"2025-12-07T14:07:41.485795484Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"wongn7bj","id":"1(@)"} +{"time":"2025-12-07T14:09:29.974933716Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug-internal.log b/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a5a33d6d983e7cb3f389c3a8ad83516c3a221aef --- /dev/null +++ b/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-07T14:07:41.309572818Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T14:07:41.485605356Z","level":"INFO","msg":"stream: created new stream","id":"wongn7bj"} +{"time":"2025-12-07T14:07:41.485692486Z","level":"INFO","msg":"handler: started","stream_id":"wongn7bj"} +{"time":"2025-12-07T14:07:41.485786809Z","level":"INFO","msg":"stream: started","id":"wongn7bj"} +{"time":"2025-12-07T14:07:41.48579403Z","level":"INFO","msg":"writer: started","stream_id":"wongn7bj"} +{"time":"2025-12-07T14:07:41.485797868Z","level":"INFO","msg":"sender: started","stream_id":"wongn7bj"} diff --git a/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug.log b/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5ab53b0104e89bea6e19a0f0bd34d4dc617041f4 --- /dev/null +++ b/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_setup.py:_flush():80] Configure stats pid to 4138782 +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 14:07:40,975 INFO MainThread:4138782 
[wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug.log +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_140740-wongn7bj/logs/debug-internal.log +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_init.py:init():841] calling init triggers +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 14:07:40,975 INFO MainThread:4138782 [wandb_init.py:init():889] starting backend +2025-12-07 14:07:41,298 INFO MainThread:4138782 [wandb_init.py:init():892] sending inform_init request +2025-12-07 14:07:41,305 INFO MainThread:4138782 [wandb_init.py:init():900] backend started and connected +2025-12-07 14:07:41,307 INFO MainThread:4138782 [wandb_init.py:init():970] updated telemetry +2025-12-07 14:07:41,312 INFO MainThread:4138782 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 14:07:41,787 INFO MainThread:4138782 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 14:07:41,896 INFO MainThread:4138782 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 14:07:41,896 INFO MainThread:4138782 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 14:07:41,896 INFO MainThread:4138782 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 14:07:41,896 INFO MainThread:4138782 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 14:07:41,900 INFO MainThread:4138782 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 14:07:41,901 INFO MainThread:4138782 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a dog running'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 1, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 
'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251207_140740-wongn7bj/run-wongn7bj.wandb b/Meissonic/wandb/run-20251207_140740-wongn7bj/run-wongn7bj.wandb new file mode 100644 index 0000000000000000000000000000000000000000..069910167bcae16fafb9c1ee28f27cfd05da1f71 Binary files /dev/null and b/Meissonic/wandb/run-20251207_140740-wongn7bj/run-wongn7bj.wandb differ diff --git a/Meissonic/wandb/run-20251207_162232-5936yne3/files/output.log b/Meissonic/wandb/run-20251207_162232-5936yne3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..56675c2fbb4aa2a421fc8290f04ad39d57aaf328 --- /dev/null +++ b/Meissonic/wandb/run-20251207_162232-5936yne3/files/output.log @@ -0,0 +1,26 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7005.52it/s] +12/07/2025 16:22:40 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=4, W'=7 +12/07/2025 16:22:40 - INFO - __main__ - Theoretical dimensions: F'=1, H'=4, W'=7 +12/07/2025 16:22:40 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 16:22:40 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 16:22:56 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 16:22:56 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 16:22:56 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 16:22:58 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 16:22:58 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 16:23:00 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 16:23:07 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 16:23:07 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 16:23:07 - INFO - __main__ - Dataloader configuration: +12/07/2025 16:23:07 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/07/2025 16:23:07 - INFO - __main__ - - prefetch_factor: 2 +12/07/2025 16:23:07 - INFO - __main__ - - persistent_workers: True +12/07/2025 16:23:07 - INFO - __main__ - - pin_memory: True +12/07/2025 16:23:07 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 16:23:10 - INFO - __main__ - ***** Running training ***** +12/07/2025 16:23:10 - INFO - __main__ - Num training steps = 10000 +12/07/2025 16:23:10 - INFO - __main__ - Instantaneous batch size per device = 4 +12/07/2025 16:23:10 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 128 +12/07/2025 16:23:10 - INFO - __main__ - Gradient Accumulation steps = 4 +12/07/2025 16:24:09 - INFO - __main__ - Step: 10 Loss: 11.1066 LR: 0.000300 diff --git a/Meissonic/wandb/run-20251207_162232-5936yne3/files/requirements.txt b/Meissonic/wandb/run-20251207_162232-5936yne3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_162232-5936yne3/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_162232-5936yne3/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_162232-5936yne3/files/wandb-metadata.json new file mode 100644 index 
0000000000000000000000000000000000000000..309345dc6fef4f2fc917af296002592c1a8e6fc8 --- /dev/null +++ b/Meissonic/wandb/run-20251207_162232-5936yne3/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T16:22:32.470052Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "8", + "--video_height", + "64", + "--video_width", + "112", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11761606651904" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": 
"Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "qfq6wvilussvpz4gmiysfqhvi2v8cv2e" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug-core.log b/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..706283a8f6571e30f3885090dce9007c100f3222 --- /dev/null +++ b/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-07T16:22:32.539501625Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp5kepa0h2/port-1294996.txt","pid":1294996,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T16:22:32.541010216Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1294996} +{"time":"2025-12-07T16:22:32.540984344Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1294996-1295256-457235504/socket","Net":"unix"}} +{"time":"2025-12-07T16:22:32.726273607Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-07T16:22:32.73235234Z","level":"INFO","msg":"handleInformInit: received","streamId":"5936yne3","id":"1(@)"} +{"time":"2025-12-07T16:22:32.898964642Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"5936yne3","id":"1(@)"} +{"time":"2025-12-07T16:24:23.152698092Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug-internal.log b/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5fcb206ef9076624021250f242940de48b097c7f --- /dev/null +++ b/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-07T16:22:32.732458931Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T16:22:32.898753472Z","level":"INFO","msg":"stream: created new stream","id":"5936yne3"} +{"time":"2025-12-07T16:22:32.898863331Z","level":"INFO","msg":"handler: started","stream_id":"5936yne3"} +{"time":"2025-12-07T16:22:32.898954887Z","level":"INFO","msg":"stream: started","id":"5936yne3"} +{"time":"2025-12-07T16:22:32.898975933Z","level":"INFO","msg":"writer: started","stream_id":"5936yne3"} +{"time":"2025-12-07T16:22:32.898982221Z","level":"INFO","msg":"sender: started","stream_id":"5936yne3"} diff --git a/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug.log b/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..1c5fc98530de6fe85367f350db769e479dc251ce --- /dev/null +++ b/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-07 16:22:32,472 INFO MainThread:1294996 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_setup.py:_flush():80] Configure stats pid to 1294996 +2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_setup.py:_flush():80] Loading settings from environment variables 
+2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug.log +2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_162232-5936yne3/logs/debug-internal.log +2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_init.py:init():841] calling init triggers +2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 16:22:32,473 INFO MainThread:1294996 [wandb_init.py:init():889] starting backend +2025-12-07 16:22:32,726 INFO MainThread:1294996 [wandb_init.py:init():892] sending inform_init request +2025-12-07 16:22:32,730 INFO MainThread:1294996 [wandb_init.py:init():900] backend started and connected +2025-12-07 16:22:32,732 INFO MainThread:1294996 [wandb_init.py:init():970] updated telemetry +2025-12-07 16:22:32,736 INFO MainThread:1294996 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 16:22:33,084 INFO MainThread:1294996 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 16:22:33,192 INFO MainThread:1294996 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 16:22:33,192 INFO MainThread:1294996 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 16:22:33,193 INFO MainThread:1294996 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 16:22:33,193 INFO MainThread:1294996 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-07 16:22:33,195 INFO MainThread:1294996 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 16:22:33,196 INFO MainThread:1294996 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 8, 'video_height': 64, 'video_width': 112, 'video_tokenizer_model_id': 
'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251207_162232-5936yne3/run-5936yne3.wandb b/Meissonic/wandb/run-20251207_162232-5936yne3/run-5936yne3.wandb new file mode 100644 index 0000000000000000000000000000000000000000..637397092d0db7429e397eee074c33f0e33aacac Binary files /dev/null and b/Meissonic/wandb/run-20251207_162232-5936yne3/run-5936yne3.wandb differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1000_44b777600b7402a0b1a7.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1000_44b777600b7402a0b1a7.png new file mode 100644 index 0000000000000000000000000000000000000000..43df7ce27f7a3baafa5e96960a7ed3aaf356822d Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1000_44b777600b7402a0b1a7.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1000_e210a818106f0f6db94a.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1000_e210a818106f0f6db94a.png new file mode 100644 index 0000000000000000000000000000000000000000..b3906600815d5fe97e4c179be3e57442c2adc07d Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1000_e210a818106f0f6db94a.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1500_1268bc00f1d89990da84.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1500_1268bc00f1d89990da84.png new file mode 100644 index 0000000000000000000000000000000000000000..6f2590b825e0d2ee626f55402a2e33289ffd21d4 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1500_1268bc00f1d89990da84.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1500_e5e54b1e3d82c13cef66.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1500_e5e54b1e3d82c13cef66.png new file mode 100644 index 0000000000000000000000000000000000000000..8a33a1e9d524b64cf9075f90837a9df07f33e03a Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_1500_e5e54b1e3d82c13cef66.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2000_329c1c96d37e37a43c46.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2000_329c1c96d37e37a43c46.png new file mode 100644 index 0000000000000000000000000000000000000000..05b21ea00869ce35e4e9688e58ca3c52a4857f71 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2000_329c1c96d37e37a43c46.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2000_65df961855004ce3306c.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2000_65df961855004ce3306c.png new file mode 100644 index 0000000000000000000000000000000000000000..7575ff30e7761c1f0504647a427edd78f6808038 Binary files /dev/null and 
b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2000_65df961855004ce3306c.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2500_2bc19f239d85a89f62bf.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2500_2bc19f239d85a89f62bf.png new file mode 100644 index 0000000000000000000000000000000000000000..d53cac1c524a8fe1f6bd73c04b76aae0a8376105 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2500_2bc19f239d85a89f62bf.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2500_813746d01cbaaed87c39.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2500_813746d01cbaaed87c39.png new file mode 100644 index 0000000000000000000000000000000000000000..0a6d0820f88af25095c25b02944cb1acb6926410 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_2500_813746d01cbaaed87c39.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3000_345a8f7090004b57b9db.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3000_345a8f7090004b57b9db.png new file mode 100644 index 0000000000000000000000000000000000000000..5ff2123f844e6841ed73c21910d705297a3ed2ca Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3000_345a8f7090004b57b9db.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3000_482cd504885ac6ba71a3.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3000_482cd504885ac6ba71a3.png new file mode 100644 index 0000000000000000000000000000000000000000..ad320a1185b9152c005072f847ef5dde3759d884 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3000_482cd504885ac6ba71a3.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3500_5c24b2c590a7cb20d2fc.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3500_5c24b2c590a7cb20d2fc.png new file mode 100644 index 0000000000000000000000000000000000000000..eec53728d51b9e1af9dba1cfa3cf74f3cc4197e4 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3500_5c24b2c590a7cb20d2fc.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3500_cbff2daed0e7f1283161.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3500_cbff2daed0e7f1283161.png new file mode 100644 index 0000000000000000000000000000000000000000..0dbc43bd7c9cfad3ce9c483f144edbe9ee448155 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_3500_cbff2daed0e7f1283161.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4000_0bc3dd0e13a232e77fae.png 
b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4000_0bc3dd0e13a232e77fae.png new file mode 100644 index 0000000000000000000000000000000000000000..645386bf728382f950dc3f967c934418d516c394 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4000_0bc3dd0e13a232e77fae.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4000_8b32d2cf1736c6c115f3.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4000_8b32d2cf1736c6c115f3.png new file mode 100644 index 0000000000000000000000000000000000000000..39290327427cb9e67d4475e15e70243271f4bf11 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4000_8b32d2cf1736c6c115f3.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4500_c149a271850a23e77a82.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4500_c149a271850a23e77a82.png new file mode 100644 index 0000000000000000000000000000000000000000..b9efbce8b6ac114730fd09efbec04e94df2fae60 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4500_c149a271850a23e77a82.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4500_c1dc78172f1273fdfcc6.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4500_c1dc78172f1273fdfcc6.png new file mode 100644 index 0000000000000000000000000000000000000000..519d909089bd32200a0cc3044ef9f6896f8e321b Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_4500_c1dc78172f1273fdfcc6.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5000_57af26b2ec9b14885ca3.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5000_57af26b2ec9b14885ca3.png new file mode 100644 index 0000000000000000000000000000000000000000..34bb47a00e7e93745834558d742deef14b142d0e Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5000_57af26b2ec9b14885ca3.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5000_c84e2684eb70925808ac.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5000_c84e2684eb70925808ac.png new file mode 100644 index 0000000000000000000000000000000000000000..bb40930d3427ac9709e0c67322f938b62782317b Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5000_c84e2684eb70925808ac.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_500_0424d0bee70c4a00f2ac.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_500_0424d0bee70c4a00f2ac.png new file mode 100644 index 0000000000000000000000000000000000000000..7ba68bf1b1dc65b3545513285d6160b6fe30b584 Binary files /dev/null and 
b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_500_0424d0bee70c4a00f2ac.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_500_f03e54c76b828cab34d7.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_500_f03e54c76b828cab34d7.png new file mode 100644 index 0000000000000000000000000000000000000000..393d5d0268f80cb06c1f8993a72edeff1cfbcea5 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_500_f03e54c76b828cab34d7.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5500_6db9b44c08cc7a6fd2a3.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5500_6db9b44c08cc7a6fd2a3.png new file mode 100644 index 0000000000000000000000000000000000000000..e873d48f858c80c621bf5e9629e0bc6e7d5baa86 Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5500_6db9b44c08cc7a6fd2a3.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5500_cfb2e62370fc7618f0d8.png b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5500_cfb2e62370fc7618f0d8.png new file mode 100644 index 0000000000000000000000000000000000000000..2d3df62c04e066bf9a2c07d0d62726d880cc58de Binary files /dev/null and b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/media/images/generated_videos_first_frame_5500_cfb2e62370fc7618f0d8.png differ diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/output.log b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a7c1627ab439f88453ef279a079a7f8e59df343b --- /dev/null +++ b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/output.log @@ -0,0 +1,696 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 12022.98it/s] +12/07/2025 16:24:50 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=4, W'=7 +12/07/2025 16:24:50 - INFO - __main__ - Theoretical dimensions: F'=1, H'=4, W'=7 +12/07/2025 16:24:50 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 16:24:50 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/07/2025 16:25:06 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 16:25:06 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/07/2025 16:25:06 - INFO - __main__ - Removed 4 text_embedding keys due to input dimension mismatch (pretrained: 4096, model: 768) +12/07/2025 16:25:08 - INFO - __main__ - Only text_embedding keys are missing (expected due to text_dim mismatch) +12/07/2025 16:25:08 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/07/2025 16:25:09 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/07/2025 16:25:17 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/07/2025 16:25:17 - INFO - train.dataset_utils - Using decord for video loading +12/07/2025 16:25:17 - INFO - __main__ - Dataloader configuration: +12/07/2025 16:25:17 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/07/2025 16:25:17 - INFO - __main__ - - prefetch_factor: 2 +12/07/2025 16:25:17 - INFO - __main__ - - persistent_workers: True +12/07/2025 16:25:17 - INFO - __main__ - - pin_memory: True +12/07/2025 16:25:17 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/07/2025 16:25:20 - INFO - __main__ - ***** Running training ***** +12/07/2025 16:25:20 - INFO - __main__ - Num training steps = 10000 +12/07/2025 16:25:20 - INFO - __main__ - Instantaneous batch size per device = 16 +12/07/2025 16:25:20 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 128 +12/07/2025 16:25:20 - INFO - __main__ - Gradient Accumulation steps = 1 +12/07/2025 16:26:25 - INFO - __main__ - Step: 10 Loss: 11.1033 LR: 0.000300 +12/07/2025 16:27:57 - INFO - __main__ - Step: 20 Loss: 11.1041 LR: 0.000300 +12/07/2025 16:28:52 - INFO - __main__ - Step: 30 Loss: 11.1004 LR: 0.000300 +12/07/2025 16:30:05 - INFO - __main__ - Step: 40 Loss: 11.0977 LR: 0.000300 +12/07/2025 16:31:19 - INFO - __main__ - Step: 50 Loss: 11.0966 LR: 0.000300 +12/07/2025 16:33:00 - INFO - __main__ - Step: 60 Loss: 11.0999 LR: 0.000300 +12/07/2025 16:33:58 - INFO - __main__ - Step: 70 Loss: 11.0940 LR: 0.000300 +12/07/2025 16:35:07 - INFO - __main__ - Step: 80 Loss: 11.0925 LR: 0.000300 +12/07/2025 16:36:31 - INFO - __main__ - Step: 90 Loss: 11.0854 LR: 0.000300 +12/07/2025 16:37:26 - INFO - __main__ - Step: 100 Loss: 11.0795 LR: 0.000300 +12/07/2025 16:39:01 - INFO - __main__ - Step: 110 Loss: 11.0658 LR: 0.000300 +12/07/2025 16:40:12 - INFO - __main__ - Step: 120 Loss: 11.0497 LR: 0.000300 +12/07/2025 16:41:19 - INFO - __main__ - Step: 130 Loss: 11.0412 LR: 0.000300 +12/07/2025 16:42:27 - INFO - __main__ - Step: 140 Loss: 11.0038 LR: 0.000300 +12/07/2025 16:43:24 - INFO - __main__ - Step: 150 Loss: 10.9771 LR: 0.000300 +12/07/2025 16:44:59 - INFO - __main__ - Step: 160 Loss: 10.9322 LR: 0.000300 +12/07/2025 16:46:23 - INFO - __main__ - Step: 170 Loss: 10.9129 LR: 0.000300 +12/07/2025 16:47:14 - INFO - __main__ - Step: 180 Loss: 10.8578 LR: 0.000300 +12/07/2025 16:48:27 - INFO - __main__ - Step: 190 Loss: 10.8257 LR: 0.000300 +12/07/2025 16:49:57 - INFO - __main__ - Step: 200 Loss: 10.7859 LR: 0.000300 +12/07/2025 16:51:01 - INFO - __main__ - Step: 210 Loss: 10.7523 LR: 0.000300 +12/07/2025 16:52:36 - INFO - __main__ - Step: 220 Loss: 10.6983 LR: 0.000300 +12/07/2025 16:53:28 - INFO - __main__ - Step: 230 Loss: 10.6997 LR: 0.000300 +12/07/2025 16:54:38 - INFO - __main__ - Step: 240 Loss: 10.6533 LR: 0.000300 +12/07/2025 16:56:02 - INFO - __main__ - Step: 250 Loss: 10.6413 LR: 0.000300 +12/07/2025 16:57:05 - INFO - __main__ - Step: 260 Loss: 10.6185 LR: 0.000300 +12/07/2025 16:58:34 - INFO - __main__ - Step: 270 Loss: 10.5831 LR: 0.000300 +12/07/2025 16:59:25 - INFO - __main__ - Step: 280 Loss: 10.5785 LR: 0.000300 +12/07/2025 17:01:07 - INFO - __main__ - Step: 290 Loss: 10.5342 LR: 0.000300 +12/07/2025 17:02:06 - INFO - __main__ - Step: 300 Loss: 10.5041 LR: 0.000300 +12/07/2025 17:02:38 - INFO - __main__ - Step: 310 Loss: 10.4910 LR: 0.000300 +12/07/2025 17:04:09 - INFO - __main__ - Step: 320 Loss: 10.4752 LR: 0.000300 +12/07/2025 17:05:27 - INFO - __main__ - Step: 330 Loss: 10.4914 LR: 0.000300 +12/07/2025 17:06:50 - INFO - __main__ - Step: 340 Loss: 10.4597 LR: 0.000300 +12/07/2025 17:07:33 - INFO - __main__ - Step: 350 Loss: 10.4452 LR: 0.000300 +12/07/2025 17:09:06 - INFO - __main__ - Step: 360 Loss: 10.4513 LR: 0.000300 +12/07/2025 17:10:17 - INFO - __main__ - Step: 370 Loss: 10.4291 LR: 0.000300 +12/07/2025 17:10:53 - INFO - __main__ - Step: 380 Loss: 10.4130 LR: 0.000300 +12/07/2025 17:11:26 - INFO - __main__ - Step: 390 Loss: 10.4101 LR: 0.000300 +12/07/2025 17:12:00 - INFO - __main__ - Step: 400 Loss: 10.4168 LR: 0.000300 +12/07/2025 17:12:37 - INFO - __main__ - Step: 410 Loss: 10.3808 LR: 0.000300 +12/07/2025 17:13:13 - INFO - __main__ - Step: 420 Loss: 10.4199 LR: 0.000300 +12/07/2025 17:13:47 - INFO - __main__ - Step: 430 Loss: 10.3697 LR: 0.000300 +12/07/2025 17:14:23 - INFO - __main__ - Step: 440 Loss: 10.3635 LR: 0.000300 +12/07/2025 
17:15:04 - INFO - __main__ - Step: 450 Loss: 10.3445 LR: 0.000300 +12/07/2025 17:15:41 - INFO - __main__ - Step: 460 Loss: 10.3415 LR: 0.000300 +12/07/2025 17:16:16 - INFO - __main__ - Step: 470 Loss: 10.3457 LR: 0.000300 +12/07/2025 17:16:52 - INFO - __main__ - Step: 480 Loss: 10.3282 LR: 0.000300 +12/07/2025 17:17:30 - INFO - __main__ - Step: 490 Loss: 10.3382 LR: 0.000300 +12/07/2025 17:18:06 - INFO - __main__ - Step: 500 Loss: 10.3192 LR: 0.000300 +12/07/2025 17:18:06 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-500 +12/07/2025 17:19:00 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-500/optimizer.bin +12/07/2025 17:19:00 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-500/scheduler.bin +12/07/2025 17:19:00 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-500/sampler.bin +12/07/2025 17:19:00 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-500/random_states_0.pkl +12/07/2025 17:19:00 - INFO - __main__ - Saved state to output/checkpoint-500 +12/07/2025 17:19:00 - INFO - __main__ - Generating videos for validation... +12/07/2025 17:19:00 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.34it/s] +12/07/2025 17:19:07 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 17:19:39 - INFO - __main__ - Step: 510 Loss: 10.3140 LR: 0.000300 +12/07/2025 17:20:18 - INFO - __main__ - Step: 520 Loss: 10.3147 LR: 0.000300 +12/07/2025 17:20:52 - INFO - __main__ - Step: 530 Loss: 10.2680 LR: 0.000300 +12/07/2025 17:21:27 - INFO - __main__ - Step: 540 Loss: 10.3456 LR: 0.000300 +12/07/2025 17:22:05 - INFO - __main__ - Step: 550 Loss: 10.2588 LR: 0.000300 +12/07/2025 17:22:42 - INFO - __main__ - Step: 560 Loss: 10.2806 LR: 0.000300 +12/07/2025 17:23:19 - INFO - __main__ - Step: 570 Loss: 10.2781 LR: 0.000300 +12/07/2025 17:23:54 - INFO - __main__ - Step: 580 Loss: 10.2838 LR: 0.000300 +12/07/2025 17:24:33 - INFO - __main__ - Step: 590 Loss: 10.2722 LR: 0.000300 +12/07/2025 17:25:13 - INFO - __main__ - Step: 600 Loss: 10.2965 LR: 0.000300 +12/07/2025 17:25:49 - INFO - __main__ - Step: 610 Loss: 10.2650 LR: 0.000300 +12/07/2025 17:26:27 - INFO - __main__ - Step: 620 Loss: 10.2084 LR: 0.000300 +12/07/2025 17:27:05 - INFO - __main__ - Step: 630 Loss: 10.2215 LR: 0.000300 +12/07/2025 17:27:44 - INFO - __main__ - Step: 640 Loss: 10.2277 LR: 0.000300 +12/07/2025 17:28:23 - INFO - __main__ - Step: 650 Loss: 10.2791 LR: 0.000300 +12/07/2025 17:29:04 - INFO - __main__ - Step: 660 Loss: 10.2563 LR: 0.000300 +12/07/2025 17:29:44 - INFO - __main__ - Step: 670 Loss: 10.2534 LR: 0.000300 +12/07/2025 17:30:24 - INFO - __main__ - Step: 680 Loss: 10.2507 LR: 0.000300 +12/07/2025 17:31:05 - INFO - __main__ - Step: 690 Loss: 10.2491 LR: 0.000300 +12/07/2025 17:31:41 - INFO - __main__ - Step: 700 Loss: 10.2853 LR: 0.000300 +12/07/2025 17:32:18 - INFO - __main__ - Step: 710 Loss: 10.2224 LR: 0.000300 +12/07/2025 17:32:56 - INFO - __main__ - Step: 720 Loss: 10.2435 LR: 0.000300 +12/07/2025 17:33:36 - INFO - __main__ - Step: 730 Loss: 10.2312 LR: 0.000300 +12/07/2025 17:34:15 - INFO - __main__ - Step: 740 Loss: 10.2566 LR: 0.000300 +12/07/2025 17:34:53 - INFO - __main__ - Step: 750 Loss: 10.2142 LR: 0.000300 +12/07/2025 17:35:35 - INFO - __main__ - Step: 760 Loss: 
10.2274 LR: 0.000300 +12/07/2025 17:36:16 - INFO - __main__ - Step: 770 Loss: 10.2140 LR: 0.000300 +12/07/2025 17:36:55 - INFO - __main__ - Step: 780 Loss: 10.1752 LR: 0.000300 +12/07/2025 17:37:34 - INFO - __main__ - Step: 790 Loss: 10.2316 LR: 0.000300 +12/07/2025 17:38:14 - INFO - __main__ - Step: 800 Loss: 10.1980 LR: 0.000300 +12/07/2025 17:38:42 - INFO - __main__ - Step: 810 Loss: 10.2566 LR: 0.000300 +12/07/2025 17:39:12 - INFO - __main__ - Step: 820 Loss: 10.2184 LR: 0.000300 +12/07/2025 17:40:38 - INFO - __main__ - Step: 830 Loss: 10.2354 LR: 0.000300 +12/07/2025 17:41:30 - INFO - __main__ - Step: 840 Loss: 10.1956 LR: 0.000300 +12/07/2025 17:42:08 - INFO - __main__ - Step: 850 Loss: 10.2070 LR: 0.000300 +12/07/2025 17:42:51 - INFO - __main__ - Step: 860 Loss: 10.2260 LR: 0.000300 +12/07/2025 17:43:25 - INFO - __main__ - Step: 870 Loss: 10.2142 LR: 0.000300 +12/07/2025 17:44:20 - INFO - __main__ - Step: 880 Loss: 10.2021 LR: 0.000300 +12/07/2025 17:44:48 - INFO - __main__ - Step: 890 Loss: 10.2127 LR: 0.000300 +12/07/2025 17:45:26 - INFO - __main__ - Step: 900 Loss: 10.2266 LR: 0.000300 +12/07/2025 17:46:28 - INFO - __main__ - Step: 910 Loss: 10.2149 LR: 0.000300 +12/07/2025 17:47:11 - INFO - __main__ - Step: 920 Loss: 10.1895 LR: 0.000300 +12/07/2025 17:47:47 - INFO - __main__ - Step: 930 Loss: 10.1790 LR: 0.000300 +12/07/2025 17:48:48 - INFO - __main__ - Step: 940 Loss: 10.2286 LR: 0.000300 +12/07/2025 17:49:33 - INFO - __main__ - Step: 950 Loss: 10.1796 LR: 0.000300 +12/07/2025 17:50:13 - INFO - __main__ - Step: 960 Loss: 10.1974 LR: 0.000300 +12/07/2025 17:50:44 - INFO - __main__ - Step: 970 Loss: 10.1427 LR: 0.000300 +12/07/2025 17:51:54 - INFO - __main__ - Step: 980 Loss: 10.2309 LR: 0.000300 +12/07/2025 17:52:21 - INFO - __main__ - Step: 990 Loss: 10.2010 LR: 0.000300 +12/07/2025 17:52:55 - INFO - __main__ - Step: 1000 Loss: 10.2001 LR: 0.000300 +12/07/2025 17:52:55 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-1000 +12/07/2025 17:54:11 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-1000/optimizer.bin +12/07/2025 17:54:11 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-1000/scheduler.bin +12/07/2025 17:54:12 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-1000/sampler.bin +12/07/2025 17:54:12 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-1000/random_states_0.pkl +12/07/2025 17:54:12 - INFO - __main__ - Saved state to output/checkpoint-1000 +12/07/2025 17:54:12 - INFO - __main__ - Generating videos for validation... +12/07/2025 17:54:12 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.37it/s] +12/07/2025 17:54:19 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 17:54:47 - INFO - __main__ - Step: 1010 Loss: 10.2005 LR: 0.000300 +12/07/2025 17:55:10 - INFO - __main__ - Step: 1020 Loss: 10.1823 LR: 0.000300 +12/07/2025 17:56:56 - INFO - __main__ - Step: 1030 Loss: 10.1934 LR: 0.000300 +12/07/2025 17:57:54 - INFO - __main__ - Step: 1040 Loss: 10.1914 LR: 0.000300 +12/07/2025 17:59:10 - INFO - __main__ - Step: 1050 Loss: 10.2207 LR: 0.000300 +12/07/2025 18:00:23 - INFO - __main__ - Step: 1060 Loss: 10.1580 LR: 0.000300 +12/07/2025 18:01:19 - INFO - __main__ - Step: 1070 Loss: 10.1582 LR: 0.000300 +12/07/2025 18:03:12 - INFO - __main__ - Step: 1080 Loss: 10.1831 LR: 0.000300 +12/07/2025 18:04:11 - INFO - __main__ - Step: 1090 Loss: 10.1912 LR: 0.000300 +12/07/2025 18:05:02 - INFO - __main__ - Step: 1100 Loss: 10.1714 LR: 0.000300 +12/07/2025 18:06:31 - INFO - __main__ - Step: 1110 Loss: 10.1630 LR: 0.000300 +12/07/2025 18:08:05 - INFO - __main__ - Step: 1120 Loss: 10.1448 LR: 0.000300 +12/07/2025 18:08:55 - INFO - __main__ - Step: 1130 Loss: 10.1923 LR: 0.000300 +12/07/2025 18:09:47 - INFO - __main__ - Step: 1140 Loss: 10.1450 LR: 0.000300 +12/07/2025 18:11:00 - INFO - __main__ - Step: 1150 Loss: 10.1807 LR: 0.000300 +12/07/2025 18:12:21 - INFO - __main__ - Step: 1160 Loss: 10.1212 LR: 0.000300 +12/07/2025 18:13:35 - INFO - __main__ - Step: 1170 Loss: 10.1709 LR: 0.000300 +12/07/2025 18:14:56 - INFO - __main__ - Step: 1180 Loss: 10.1850 LR: 0.000300 +12/07/2025 18:16:14 - INFO - __main__ - Step: 1190 Loss: 10.1948 LR: 0.000300 +12/07/2025 18:16:43 - INFO - __main__ - Step: 1200 Loss: 10.2182 LR: 0.000300 +12/07/2025 18:18:13 - INFO - __main__ - Step: 1210 Loss: 10.1537 LR: 0.000300 +12/07/2025 18:19:33 - INFO - __main__ - Step: 1220 Loss: 10.1617 LR: 0.000300 +12/07/2025 18:20:39 - INFO - __main__ - Step: 1230 Loss: 10.1628 LR: 0.000300 +12/07/2025 18:21:53 - INFO - __main__ - Step: 1240 Loss: 10.1704 LR: 0.000300 +12/07/2025 18:23:26 - INFO - __main__ - Step: 1250 Loss: 10.1567 LR: 0.000300 +12/07/2025 18:24:31 - INFO - __main__ - Step: 1260 Loss: 10.2149 LR: 0.000300 +12/07/2025 18:25:17 - INFO - __main__ - Step: 1270 Loss: 10.1728 LR: 0.000300 +12/07/2025 18:27:05 - INFO - __main__ - Step: 1280 Loss: 10.1604 LR: 0.000300 +12/07/2025 18:27:58 - INFO - __main__ - Step: 1290 Loss: 10.1777 LR: 0.000300 +12/07/2025 18:29:03 - INFO - __main__ - Step: 1300 Loss: 10.1720 LR: 0.000300 +12/07/2025 18:30:18 - INFO - __main__ - Step: 1310 Loss: 10.1639 LR: 0.000300 +12/07/2025 18:32:01 - INFO - __main__ - Step: 1320 Loss: 10.1699 LR: 0.000300 +12/07/2025 18:32:59 - INFO - __main__ - Step: 1330 Loss: 10.1357 LR: 0.000300 +12/07/2025 18:33:36 - INFO - __main__ - Step: 1340 Loss: 10.1204 LR: 0.000300 +12/07/2025 18:34:44 - INFO - __main__ - Step: 1350 Loss: 10.1452 LR: 0.000300 +12/07/2025 18:36:08 - INFO - __main__ - Step: 1360 Loss: 10.2035 LR: 0.000300 +12/07/2025 18:37:40 - INFO - __main__ - Step: 1370 Loss: 10.1475 LR: 0.000300 +12/07/2025 18:38:21 - INFO - __main__ - Step: 1380 Loss: 10.1088 LR: 0.000300 +12/07/2025 18:39:51 - INFO - __main__ - Step: 1390 Loss: 10.1138 LR: 0.000300 +12/07/2025 18:41:01 - INFO - __main__ - Step: 1400 Loss: 10.1653 LR: 0.000300 +12/07/2025 18:42:18 - INFO - __main__ - Step: 1410 Loss: 10.1740 LR: 0.000300 +12/07/2025 18:43:34 - INFO - __main__ - Step: 
1420 Loss: 10.1698 LR: 0.000300 +12/07/2025 18:44:45 - INFO - __main__ - Step: 1430 Loss: 10.1413 LR: 0.000300 +12/07/2025 18:46:12 - INFO - __main__ - Step: 1440 Loss: 10.1127 LR: 0.000300 +12/07/2025 18:47:09 - INFO - __main__ - Step: 1450 Loss: 10.1284 LR: 0.000300 +12/07/2025 18:47:46 - INFO - __main__ - Step: 1460 Loss: 10.1595 LR: 0.000300 +12/07/2025 18:49:30 - INFO - __main__ - Step: 1470 Loss: 10.1903 LR: 0.000300 +12/07/2025 18:50:38 - INFO - __main__ - Step: 1480 Loss: 10.1181 LR: 0.000300 +12/07/2025 18:51:42 - INFO - __main__ - Step: 1490 Loss: 10.1721 LR: 0.000300 +12/07/2025 18:52:39 - INFO - __main__ - Step: 1500 Loss: 10.1124 LR: 0.000300 +12/07/2025 18:52:39 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-1500 +12/07/2025 18:52:49 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-1500/optimizer.bin +12/07/2025 18:52:49 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-1500/scheduler.bin +12/07/2025 18:52:49 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-1500/sampler.bin +12/07/2025 18:52:49 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-1500/random_states_0.pkl +12/07/2025 18:52:49 - INFO - __main__ - Saved state to output/checkpoint-1500 +12/07/2025 18:52:49 - INFO - __main__ - Generating videos for validation... +12/07/2025 18:52:49 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.73it/s] +12/07/2025 18:52:55 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 18:54:36 - INFO - __main__ - Step: 1510 Loss: 10.1322 LR: 0.000300 +12/07/2025 18:55:40 - INFO - __main__ - Step: 1520 Loss: 10.1513 LR: 0.000300 +12/07/2025 18:56:36 - INFO - __main__ - Step: 1530 Loss: 10.1439 LR: 0.000300 +12/07/2025 18:57:48 - INFO - __main__ - Step: 1540 Loss: 10.1751 LR: 0.000300 +12/07/2025 18:59:26 - INFO - __main__ - Step: 1550 Loss: 10.1061 LR: 0.000300 +12/07/2025 19:00:33 - INFO - __main__ - Step: 1560 Loss: 10.1631 LR: 0.000300 +12/07/2025 19:01:44 - INFO - __main__ - Step: 1570 Loss: 10.1543 LR: 0.000300 +12/07/2025 19:02:39 - INFO - __main__ - Step: 1580 Loss: 10.1554 LR: 0.000300 +12/07/2025 19:04:37 - INFO - __main__ - Step: 1590 Loss: 10.1294 LR: 0.000300 +12/07/2025 19:05:43 - INFO - __main__ - Step: 1600 Loss: 10.1904 LR: 0.000300 +12/07/2025 19:06:47 - INFO - __main__ - Step: 1610 Loss: 10.1552 LR: 0.000300 +12/07/2025 19:08:21 - INFO - __main__ - Step: 1620 Loss: 10.1177 LR: 0.000300 +12/07/2025 19:09:30 - INFO - __main__ - Step: 1630 Loss: 10.1860 LR: 0.000300 +12/07/2025 19:10:27 - INFO - __main__ - Step: 1640 Loss: 10.1420 LR: 0.000300 +12/07/2025 19:11:57 - INFO - __main__ - Step: 1650 Loss: 10.1693 LR: 0.000300 +12/07/2025 19:12:35 - INFO - __main__ - Step: 1660 Loss: 10.1729 LR: 0.000300 +12/07/2025 19:14:32 - INFO - __main__ - Step: 1670 Loss: 10.1426 LR: 0.000300 +12/07/2025 19:15:42 - INFO - __main__ - Step: 1680 Loss: 10.1411 LR: 0.000300 +12/07/2025 19:16:28 - INFO - __main__ - Step: 1690 Loss: 10.1301 LR: 0.000300 +12/07/2025 19:17:29 - INFO - __main__ - Step: 1700 Loss: 10.1119 LR: 0.000300 +12/07/2025 19:18:41 - INFO - __main__ - Step: 1710 Loss: 10.1723 LR: 0.000300 +12/07/2025 19:20:11 - INFO - __main__ - Step: 1720 Loss: 10.1459 LR: 0.000300 +12/07/2025 19:21:15 - INFO - __main__ - Step: 1730 Loss: 
10.1574 LR: 0.000300 +12/07/2025 19:22:25 - INFO - __main__ - Step: 1740 Loss: 10.1167 LR: 0.000300 +12/07/2025 19:23:17 - INFO - __main__ - Step: 1750 Loss: 10.1751 LR: 0.000300 +12/07/2025 19:25:12 - INFO - __main__ - Step: 1760 Loss: 10.1877 LR: 0.000300 +12/07/2025 19:26:17 - INFO - __main__ - Step: 1770 Loss: 10.1407 LR: 0.000300 +12/07/2025 19:26:40 - INFO - __main__ - Step: 1780 Loss: 10.1484 LR: 0.000300 +12/07/2025 19:28:01 - INFO - __main__ - Step: 1790 Loss: 10.1138 LR: 0.000300 +12/07/2025 19:29:13 - INFO - __main__ - Step: 1800 Loss: 10.1661 LR: 0.000300 +12/07/2025 19:30:35 - INFO - __main__ - Step: 1810 Loss: 10.1577 LR: 0.000300 +12/07/2025 19:31:52 - INFO - __main__ - Step: 1820 Loss: 10.1003 LR: 0.000300 +12/07/2025 19:32:57 - INFO - __main__ - Step: 1830 Loss: 10.1812 LR: 0.000300 +12/07/2025 19:34:07 - INFO - __main__ - Step: 1840 Loss: 10.1159 LR: 0.000300 +12/07/2025 19:34:57 - INFO - __main__ - Step: 1850 Loss: 10.1697 LR: 0.000300 +12/07/2025 19:36:39 - INFO - __main__ - Step: 1860 Loss: 10.1536 LR: 0.000300 +12/07/2025 19:37:40 - INFO - __main__ - Step: 1870 Loss: 10.1706 LR: 0.000300 +12/07/2025 19:38:44 - INFO - __main__ - Step: 1880 Loss: 10.1475 LR: 0.000300 +12/07/2025 19:40:01 - INFO - __main__ - Step: 1890 Loss: 10.1564 LR: 0.000300 +12/07/2025 19:41:45 - INFO - __main__ - Step: 1900 Loss: 10.1808 LR: 0.000300 +12/07/2025 19:42:19 - INFO - __main__ - Step: 1910 Loss: 10.1397 LR: 0.000300 +12/07/2025 19:43:32 - INFO - __main__ - Step: 1920 Loss: 10.1713 LR: 0.000300 +12/07/2025 19:44:27 - INFO - __main__ - Step: 1930 Loss: 10.1934 LR: 0.000300 +12/07/2025 19:45:55 - INFO - __main__ - Step: 1940 Loss: 10.1495 LR: 0.000300 +12/07/2025 19:47:27 - INFO - __main__ - Step: 1950 Loss: 10.1704 LR: 0.000300 +12/07/2025 19:48:41 - INFO - __main__ - Step: 1960 Loss: 10.1007 LR: 0.000300 +12/07/2025 19:49:22 - INFO - __main__ - Step: 1970 Loss: 10.1508 LR: 0.000300 +12/07/2025 19:50:30 - INFO - __main__ - Step: 1980 Loss: 10.1088 LR: 0.000300 +12/07/2025 19:51:50 - INFO - __main__ - Step: 1990 Loss: 10.1007 LR: 0.000300 +12/07/2025 19:53:39 - INFO - __main__ - Step: 2000 Loss: 10.1633 LR: 0.000300 +12/07/2025 19:53:39 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-2000 +12/07/2025 19:53:50 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-2000/optimizer.bin +12/07/2025 19:53:50 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-2000/scheduler.bin +12/07/2025 19:53:50 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-2000/sampler.bin +12/07/2025 19:53:50 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-2000/random_states_0.pkl +12/07/2025 19:53:50 - INFO - __main__ - Saved state to output/checkpoint-2000 +12/07/2025 19:53:50 - INFO - __main__ - Generating videos for validation... +12/07/2025 19:53:50 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.91it/s] +12/07/2025 19:53:57 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 19:54:18 - INFO - __main__ - Step: 2010 Loss: 10.1594 LR: 0.000300 +12/07/2025 19:55:26 - INFO - __main__ - Step: 2020 Loss: 10.1396 LR: 0.000300 +12/07/2025 19:57:41 - INFO - __main__ - Step: 2030 Loss: 10.1412 LR: 0.000300 +12/07/2025 19:58:37 - INFO - __main__ - Step: 2040 Loss: 10.1486 LR: 0.000300 +12/07/2025 19:59:36 - INFO - __main__ - Step: 2050 Loss: 10.1221 LR: 0.000300 +12/07/2025 20:00:56 - INFO - __main__ - Step: 2060 Loss: 10.1243 LR: 0.000300 +12/07/2025 20:02:24 - INFO - __main__ - Step: 2070 Loss: 10.1469 LR: 0.000300 +12/07/2025 20:02:56 - INFO - __main__ - Step: 2080 Loss: 10.1595 LR: 0.000300 +12/07/2025 20:04:35 - INFO - __main__ - Step: 2090 Loss: 10.1464 LR: 0.000300 +12/07/2025 20:05:50 - INFO - __main__ - Step: 2100 Loss: 10.1028 LR: 0.000300 +12/07/2025 20:07:03 - INFO - __main__ - Step: 2110 Loss: 10.1416 LR: 0.000300 +12/07/2025 20:08:40 - INFO - __main__ - Step: 2120 Loss: 10.1180 LR: 0.000300 +12/07/2025 20:09:41 - INFO - __main__ - Step: 2130 Loss: 10.0951 LR: 0.000300 +12/07/2025 20:10:40 - INFO - __main__ - Step: 2140 Loss: 10.1197 LR: 0.000300 +12/07/2025 20:12:17 - INFO - __main__ - Step: 2150 Loss: 10.1104 LR: 0.000300 +12/07/2025 20:13:12 - INFO - __main__ - Step: 2160 Loss: 10.1151 LR: 0.000300 +12/07/2025 20:14:45 - INFO - __main__ - Step: 2170 Loss: 10.1648 LR: 0.000300 +12/07/2025 20:16:20 - INFO - __main__ - Step: 2180 Loss: 10.0615 LR: 0.000300 +12/07/2025 20:17:05 - INFO - __main__ - Step: 2190 Loss: 10.1532 LR: 0.000300 +12/07/2025 20:18:20 - INFO - __main__ - Step: 2200 Loss: 10.1731 LR: 0.000300 +12/07/2025 20:19:20 - INFO - __main__ - Step: 2210 Loss: 10.0971 LR: 0.000300 +12/07/2025 20:20:27 - INFO - __main__ - Step: 2220 Loss: 10.1072 LR: 0.000300 +12/07/2025 20:22:08 - INFO - __main__ - Step: 2230 Loss: 10.0933 LR: 0.000300 +12/07/2025 20:23:16 - INFO - __main__ - Step: 2240 Loss: 10.1418 LR: 0.000300 +12/07/2025 20:24:18 - INFO - __main__ - Step: 2250 Loss: 10.0837 LR: 0.000300 +12/07/2025 20:25:28 - INFO - __main__ - Step: 2260 Loss: 10.1261 LR: 0.000300 +12/07/2025 20:26:54 - INFO - __main__ - Step: 2270 Loss: 10.0933 LR: 0.000300 +12/07/2025 20:28:04 - INFO - __main__ - Step: 2280 Loss: 10.1105 LR: 0.000300 +12/07/2025 20:29:20 - INFO - __main__ - Step: 2290 Loss: 10.1567 LR: 0.000300 +12/07/2025 20:30:39 - INFO - __main__ - Step: 2300 Loss: 10.1307 LR: 0.000300 +12/07/2025 20:31:35 - INFO - __main__ - Step: 2310 Loss: 10.1378 LR: 0.000300 +12/07/2025 20:32:43 - INFO - __main__ - Step: 2320 Loss: 10.1715 LR: 0.000300 +12/07/2025 20:34:09 - INFO - __main__ - Step: 2330 Loss: 10.1359 LR: 0.000300 +12/07/2025 20:34:59 - INFO - __main__ - Step: 2340 Loss: 10.0916 LR: 0.000300 +12/07/2025 20:36:10 - INFO - __main__ - Step: 2350 Loss: 10.0844 LR: 0.000300 +12/07/2025 20:37:43 - INFO - __main__ - Step: 2360 Loss: 10.1179 LR: 0.000300 +12/07/2025 20:38:30 - INFO - __main__ - Step: 2370 Loss: 10.1486 LR: 0.000300 +12/07/2025 20:39:45 - INFO - __main__ - Step: 2380 Loss: 10.1077 LR: 0.000300 +12/07/2025 20:40:50 - INFO - __main__ - Step: 2390 Loss: 10.1295 LR: 0.000300 +12/07/2025 20:42:41 - INFO - __main__ - Step: 2400 Loss: 10.1412 LR: 0.000300 +12/07/2025 20:43:34 - INFO - __main__ - Step: 2410 Loss: 10.1530 LR: 0.000300 +12/07/2025 20:44:59 - INFO - __main__ - Step: 
2420 Loss: 10.1040 LR: 0.000300 +12/07/2025 20:45:52 - INFO - __main__ - Step: 2430 Loss: 10.1274 LR: 0.000300 +12/07/2025 20:46:58 - INFO - __main__ - Step: 2440 Loss: 10.1884 LR: 0.000300 +12/07/2025 20:48:45 - INFO - __main__ - Step: 2450 Loss: 10.1346 LR: 0.000300 +12/07/2025 20:49:55 - INFO - __main__ - Step: 2460 Loss: 10.1678 LR: 0.000300 +12/07/2025 20:50:29 - INFO - __main__ - Step: 2470 Loss: 10.1382 LR: 0.000300 +12/07/2025 20:51:47 - INFO - __main__ - Step: 2480 Loss: 10.1151 LR: 0.000300 +12/07/2025 20:53:08 - INFO - __main__ - Step: 2490 Loss: 10.1758 LR: 0.000300 +12/07/2025 20:54:04 - INFO - __main__ - Step: 2500 Loss: 10.1317 LR: 0.000300 +12/07/2025 20:54:04 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-2500 +12/07/2025 20:54:15 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-2500/optimizer.bin +12/07/2025 20:54:15 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-2500/scheduler.bin +12/07/2025 20:54:15 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-2500/sampler.bin +12/07/2025 20:54:15 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-2500/random_states_0.pkl +12/07/2025 20:54:15 - INFO - __main__ - Saved state to output/checkpoint-2500 +12/07/2025 20:54:15 - INFO - __main__ - Generating videos for validation... +12/07/2025 20:54:15 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.67it/s] +12/07/2025 20:54:22 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 20:55:35 - INFO - __main__ - Step: 2510 Loss: 10.1393 LR: 0.000300 +12/07/2025 20:56:57 - INFO - __main__ - Step: 2520 Loss: 10.0643 LR: 0.000300 +12/07/2025 20:58:08 - INFO - __main__ - Step: 2530 Loss: 10.1168 LR: 0.000300 +12/07/2025 20:59:23 - INFO - __main__ - Step: 2540 Loss: 10.1189 LR: 0.000300 +12/07/2025 21:00:39 - INFO - __main__ - Step: 2550 Loss: 10.1117 LR: 0.000300 +12/07/2025 21:02:10 - INFO - __main__ - Step: 2560 Loss: 10.1618 LR: 0.000300 +12/07/2025 21:03:20 - INFO - __main__ - Step: 2570 Loss: 10.1795 LR: 0.000300 +12/07/2025 21:04:12 - INFO - __main__ - Step: 2580 Loss: 10.1400 LR: 0.000300 +12/07/2025 21:05:44 - INFO - __main__ - Step: 2590 Loss: 10.0959 LR: 0.000300 +12/07/2025 21:07:06 - INFO - __main__ - Step: 2600 Loss: 10.1251 LR: 0.000300 +12/07/2025 21:08:14 - INFO - __main__ - Step: 2610 Loss: 10.1444 LR: 0.000300 +12/07/2025 21:09:08 - INFO - __main__ - Step: 2620 Loss: 10.1355 LR: 0.000300 +12/07/2025 21:10:30 - INFO - __main__ - Step: 2630 Loss: 10.1343 LR: 0.000300 +12/07/2025 21:12:05 - INFO - __main__ - Step: 2640 Loss: 10.1316 LR: 0.000300 +12/07/2025 21:12:32 - INFO - __main__ - Step: 2650 Loss: 10.1624 LR: 0.000300 +12/07/2025 21:14:19 - INFO - __main__ - Step: 2660 Loss: 10.1042 LR: 0.000300 +12/07/2025 21:15:37 - INFO - __main__ - Step: 2670 Loss: 10.1426 LR: 0.000300 +12/07/2025 21:16:55 - INFO - __main__ - Step: 2680 Loss: 10.0716 LR: 0.000300 +12/07/2025 21:18:00 - INFO - __main__ - Step: 2690 Loss: 10.1285 LR: 0.000300 +12/07/2025 21:19:18 - INFO - __main__ - Step: 2700 Loss: 10.0942 LR: 0.000300 +12/07/2025 21:20:12 - INFO - __main__ - Step: 2710 Loss: 10.0839 LR: 0.000300 +12/07/2025 21:21:30 - INFO - __main__ - Step: 2720 Loss: 10.1101 LR: 0.000300 +12/07/2025 21:22:34 - INFO - __main__ - Step: 2730 Loss: 
10.1478 LR: 0.000300 +12/07/2025 21:24:12 - INFO - __main__ - Step: 2740 Loss: 10.1602 LR: 0.000300 +12/07/2025 21:25:06 - INFO - __main__ - Step: 2750 Loss: 10.0788 LR: 0.000300 +12/07/2025 21:25:59 - INFO - __main__ - Step: 2760 Loss: 10.1927 LR: 0.000300 +12/07/2025 21:27:28 - INFO - __main__ - Step: 2770 Loss: 10.1464 LR: 0.000300 +12/07/2025 21:28:52 - INFO - __main__ - Step: 2780 Loss: 10.0615 LR: 0.000300 +12/07/2025 21:30:04 - INFO - __main__ - Step: 2790 Loss: 10.1120 LR: 0.000300 +12/07/2025 21:30:50 - INFO - __main__ - Step: 2800 Loss: 10.1104 LR: 0.000300 +12/07/2025 21:32:23 - INFO - __main__ - Step: 2810 Loss: 10.1261 LR: 0.000300 +12/07/2025 21:32:56 - INFO - __main__ - Step: 2820 Loss: 10.1971 LR: 0.000300 +12/07/2025 21:34:23 - INFO - __main__ - Step: 2830 Loss: 10.1971 LR: 0.000300 +12/07/2025 21:35:42 - INFO - __main__ - Step: 2840 Loss: 10.1517 LR: 0.000300 +12/07/2025 21:36:58 - INFO - __main__ - Step: 2850 Loss: 10.1552 LR: 0.000300 +12/07/2025 21:37:38 - INFO - __main__ - Step: 2860 Loss: 10.1134 LR: 0.000300 +12/07/2025 21:39:14 - INFO - __main__ - Step: 2870 Loss: 10.1486 LR: 0.000300 +12/07/2025 21:40:47 - INFO - __main__ - Step: 2880 Loss: 10.1272 LR: 0.000300 +12/07/2025 21:41:59 - INFO - __main__ - Step: 2890 Loss: 10.1501 LR: 0.000300 +12/07/2025 21:42:51 - INFO - __main__ - Step: 2900 Loss: 10.0966 LR: 0.000300 +12/07/2025 21:44:07 - INFO - __main__ - Step: 2910 Loss: 10.1257 LR: 0.000300 +12/07/2025 21:45:35 - INFO - __main__ - Step: 2920 Loss: 10.0896 LR: 0.000300 +12/07/2025 21:46:37 - INFO - __main__ - Step: 2930 Loss: 10.1127 LR: 0.000300 +12/07/2025 21:47:36 - INFO - __main__ - Step: 2940 Loss: 10.1499 LR: 0.000300 +12/07/2025 21:48:49 - INFO - __main__ - Step: 2950 Loss: 10.0881 LR: 0.000300 +12/07/2025 21:50:04 - INFO - __main__ - Step: 2960 Loss: 10.1073 LR: 0.000300 +12/07/2025 21:51:17 - INFO - __main__ - Step: 2970 Loss: 10.1481 LR: 0.000300 +12/07/2025 21:52:28 - INFO - __main__ - Step: 2980 Loss: 10.1436 LR: 0.000300 +12/07/2025 21:53:45 - INFO - __main__ - Step: 2990 Loss: 10.1366 LR: 0.000300 +12/07/2025 21:54:50 - INFO - __main__ - Step: 3000 Loss: 10.0908 LR: 0.000300 +12/07/2025 21:54:50 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-3000 +12/07/2025 21:55:02 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-3000/optimizer.bin +12/07/2025 21:55:02 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-3000/scheduler.bin +12/07/2025 21:55:02 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-3000/sampler.bin +12/07/2025 21:55:02 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-3000/random_states_0.pkl +12/07/2025 21:55:02 - INFO - __main__ - Saved state to output/checkpoint-3000 +12/07/2025 21:55:02 - INFO - __main__ - Generating videos for validation... +12/07/2025 21:55:02 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.02it/s] +12/07/2025 21:55:08 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 21:55:58 - INFO - __main__ - Step: 3010 Loss: 10.1158 LR: 0.000300 +12/07/2025 21:57:31 - INFO - __main__ - Step: 3020 Loss: 10.0939 LR: 0.000300 +12/07/2025 21:58:50 - INFO - __main__ - Step: 3030 Loss: 10.1003 LR: 0.000300 +12/07/2025 22:00:07 - INFO - __main__ - Step: 3040 Loss: 10.1083 LR: 0.000300 +12/07/2025 22:01:27 - INFO - __main__ - Step: 3050 Loss: 10.1178 LR: 0.000300 +12/07/2025 22:02:47 - INFO - __main__ - Step: 3060 Loss: 10.1260 LR: 0.000300 +12/07/2025 22:04:07 - INFO - __main__ - Step: 3070 Loss: 10.1390 LR: 0.000300 +12/07/2025 22:04:54 - INFO - __main__ - Step: 3080 Loss: 10.0849 LR: 0.000300 +12/07/2025 22:06:29 - INFO - __main__ - Step: 3090 Loss: 10.1434 LR: 0.000300 +12/07/2025 22:08:05 - INFO - __main__ - Step: 3100 Loss: 10.0905 LR: 0.000300 +12/07/2025 22:09:37 - INFO - __main__ - Step: 3110 Loss: 10.1203 LR: 0.000300 +12/07/2025 22:10:19 - INFO - __main__ - Step: 3120 Loss: 10.1354 LR: 0.000300 +12/07/2025 22:11:23 - INFO - __main__ - Step: 3130 Loss: 10.1442 LR: 0.000300 +12/07/2025 22:12:20 - INFO - __main__ - Step: 3140 Loss: 10.1268 LR: 0.000300 +12/07/2025 22:14:19 - INFO - __main__ - Step: 3150 Loss: 10.1746 LR: 0.000300 +12/07/2025 22:15:16 - INFO - __main__ - Step: 3160 Loss: 10.0942 LR: 0.000300 +12/07/2025 22:15:58 - INFO - __main__ - Step: 3170 Loss: 10.0822 LR: 0.000300 +12/07/2025 22:16:50 - INFO - __main__ - Step: 3180 Loss: 10.1388 LR: 0.000300 +12/07/2025 22:18:45 - INFO - __main__ - Step: 3190 Loss: 10.0759 LR: 0.000300 +12/07/2025 22:19:33 - INFO - __main__ - Step: 3200 Loss: 10.1417 LR: 0.000300 +12/07/2025 22:21:06 - INFO - __main__ - Step: 3210 Loss: 10.1128 LR: 0.000300 +12/07/2025 22:22:04 - INFO - __main__ - Step: 3220 Loss: 10.1362 LR: 0.000300 +12/07/2025 22:23:18 - INFO - __main__ - Step: 3230 Loss: 10.1147 LR: 0.000300 +12/07/2025 22:24:24 - INFO - __main__ - Step: 3240 Loss: 10.1031 LR: 0.000300 +12/07/2025 22:25:21 - INFO - __main__ - Step: 3250 Loss: 10.1037 LR: 0.000300 +12/07/2025 22:26:57 - INFO - __main__ - Step: 3260 Loss: 10.0595 LR: 0.000300 +12/07/2025 22:27:55 - INFO - __main__ - Step: 3270 Loss: 10.1461 LR: 0.000300 +12/07/2025 22:28:45 - INFO - __main__ - Step: 3280 Loss: 10.1491 LR: 0.000300 +12/07/2025 22:30:32 - INFO - __main__ - Step: 3290 Loss: 10.1111 LR: 0.000300 +12/07/2025 22:31:25 - INFO - __main__ - Step: 3300 Loss: 10.1625 LR: 0.000300 +12/07/2025 22:32:22 - INFO - __main__ - Step: 3310 Loss: 10.1440 LR: 0.000300 +12/07/2025 22:33:53 - INFO - __main__ - Step: 3320 Loss: 10.1418 LR: 0.000300 +12/07/2025 22:34:42 - INFO - __main__ - Step: 3330 Loss: 10.1094 LR: 0.000300 +12/07/2025 22:36:19 - INFO - __main__ - Step: 3340 Loss: 10.1402 LR: 0.000300 +12/07/2025 22:37:08 - INFO - __main__ - Step: 3350 Loss: 10.1113 LR: 0.000300 +12/07/2025 22:38:11 - INFO - __main__ - Step: 3360 Loss: 10.1488 LR: 0.000300 +12/07/2025 22:39:31 - INFO - __main__ - Step: 3370 Loss: 10.1225 LR: 0.000300 +12/07/2025 22:40:48 - INFO - __main__ - Step: 3380 Loss: 10.1273 LR: 0.000300 +12/07/2025 22:41:48 - INFO - __main__ - Step: 3390 Loss: 10.1285 LR: 0.000300 +12/07/2025 22:43:04 - INFO - __main__ - Step: 3400 Loss: 10.1547 LR: 0.000300 +12/07/2025 22:44:45 - INFO - __main__ - Step: 3410 Loss: 10.0919 LR: 0.000300 +12/07/2025 22:45:42 - INFO - __main__ - Step: 
3420 Loss: 10.1012 LR: 0.000300 +12/07/2025 22:46:36 - INFO - __main__ - Step: 3430 Loss: 10.0747 LR: 0.000300 +12/07/2025 22:48:06 - INFO - __main__ - Step: 3440 Loss: 10.1586 LR: 0.000300 +12/07/2025 22:49:12 - INFO - __main__ - Step: 3450 Loss: 10.1038 LR: 0.000300 +12/07/2025 22:50:16 - INFO - __main__ - Step: 3460 Loss: 10.1393 LR: 0.000300 +12/07/2025 22:51:46 - INFO - __main__ - Step: 3470 Loss: 10.1101 LR: 0.000300 +12/07/2025 22:52:58 - INFO - __main__ - Step: 3480 Loss: 10.1199 LR: 0.000300 +12/07/2025 22:53:28 - INFO - __main__ - Step: 3490 Loss: 10.1166 LR: 0.000300 +12/07/2025 22:55:14 - INFO - __main__ - Step: 3500 Loss: 10.1411 LR: 0.000300 +12/07/2025 22:55:14 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-3500 +12/07/2025 22:55:23 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-3500/optimizer.bin +12/07/2025 22:55:23 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-3500/scheduler.bin +12/07/2025 22:55:23 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-3500/sampler.bin +12/07/2025 22:55:23 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-3500/random_states_0.pkl +12/07/2025 22:55:23 - INFO - __main__ - Saved state to output/checkpoint-3500 +12/07/2025 22:55:23 - INFO - __main__ - Generating videos for validation... +12/07/2025 22:55:23 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.93it/s] +12/07/2025 22:55:29 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 22:56:41 - INFO - __main__ - Step: 3510 Loss: 10.1214 LR: 0.000300 +12/07/2025 22:57:59 - INFO - __main__ - Step: 3520 Loss: 10.0980 LR: 0.000300 +12/07/2025 22:59:07 - INFO - __main__ - Step: 3530 Loss: 10.1004 LR: 0.000300 +12/07/2025 23:00:03 - INFO - __main__ - Step: 3540 Loss: 10.1329 LR: 0.000300 +12/07/2025 23:01:21 - INFO - __main__ - Step: 3550 Loss: 10.0678 LR: 0.000300 +12/07/2025 23:03:08 - INFO - __main__ - Step: 3560 Loss: 10.1182 LR: 0.000300 +12/07/2025 23:03:49 - INFO - __main__ - Step: 3570 Loss: 10.1269 LR: 0.000300 +12/07/2025 23:05:21 - INFO - __main__ - Step: 3580 Loss: 10.1283 LR: 0.000300 +12/07/2025 23:06:30 - INFO - __main__ - Step: 3590 Loss: 10.0866 LR: 0.000300 +12/07/2025 23:07:54 - INFO - __main__ - Step: 3600 Loss: 10.0938 LR: 0.000300 +12/07/2025 23:09:08 - INFO - __main__ - Step: 3610 Loss: 10.1321 LR: 0.000300 +12/07/2025 23:10:22 - INFO - __main__ - Step: 3620 Loss: 10.0971 LR: 0.000300 +12/07/2025 23:11:39 - INFO - __main__ - Step: 3630 Loss: 10.0944 LR: 0.000300 +12/07/2025 23:12:30 - INFO - __main__ - Step: 3640 Loss: 10.1096 LR: 0.000300 +12/07/2025 23:13:51 - INFO - __main__ - Step: 3650 Loss: 10.0610 LR: 0.000300 +12/07/2025 23:14:58 - INFO - __main__ - Step: 3660 Loss: 10.1406 LR: 0.000300 +12/07/2025 23:16:10 - INFO - __main__ - Step: 3670 Loss: 10.0925 LR: 0.000300 +12/07/2025 23:17:34 - INFO - __main__ - Step: 3680 Loss: 10.1118 LR: 0.000300 +12/07/2025 23:18:27 - INFO - __main__ - Step: 3690 Loss: 10.1341 LR: 0.000300 +12/07/2025 23:19:29 - INFO - __main__ - Step: 3700 Loss: 10.1133 LR: 0.000300 +12/07/2025 23:20:27 - INFO - __main__ - Step: 3710 Loss: 10.1019 LR: 0.000300 +12/07/2025 23:22:05 - INFO - __main__ - Step: 3720 Loss: 10.1304 LR: 0.000300 +12/07/2025 23:23:19 - INFO - __main__ - Step: 3730 Loss: 
10.1350 LR: 0.000300 +12/07/2025 23:24:13 - INFO - __main__ - Step: 3740 Loss: 10.0927 LR: 0.000300 +12/07/2025 23:25:13 - INFO - __main__ - Step: 3750 Loss: 10.1549 LR: 0.000300 +12/07/2025 23:27:07 - INFO - __main__ - Step: 3760 Loss: 10.1384 LR: 0.000300 +12/07/2025 23:28:02 - INFO - __main__ - Step: 3770 Loss: 10.0838 LR: 0.000300 +12/07/2025 23:28:52 - INFO - __main__ - Step: 3780 Loss: 10.1256 LR: 0.000300 +12/07/2025 23:30:05 - INFO - __main__ - Step: 3790 Loss: 10.1051 LR: 0.000300 +12/07/2025 23:31:05 - INFO - __main__ - Step: 3800 Loss: 10.1038 LR: 0.000300 +12/07/2025 23:32:39 - INFO - __main__ - Step: 3810 Loss: 10.1181 LR: 0.000300 +12/07/2025 23:33:59 - INFO - __main__ - Step: 3820 Loss: 10.1004 LR: 0.000300 +12/07/2025 23:35:04 - INFO - __main__ - Step: 3830 Loss: 10.1324 LR: 0.000300 +12/07/2025 23:36:21 - INFO - __main__ - Step: 3840 Loss: 10.1276 LR: 0.000300 +12/07/2025 23:37:10 - INFO - __main__ - Step: 3850 Loss: 10.1421 LR: 0.000300 +12/07/2025 23:38:52 - INFO - __main__ - Step: 3860 Loss: 10.1204 LR: 0.000300 +12/07/2025 23:40:10 - INFO - __main__ - Step: 3870 Loss: 10.1103 LR: 0.000300 +12/07/2025 23:41:06 - INFO - __main__ - Step: 3880 Loss: 10.1041 LR: 0.000300 +12/07/2025 23:42:04 - INFO - __main__ - Step: 3890 Loss: 10.1313 LR: 0.000300 +12/07/2025 23:43:42 - INFO - __main__ - Step: 3900 Loss: 10.1450 LR: 0.000300 +12/07/2025 23:44:52 - INFO - __main__ - Step: 3910 Loss: 10.1357 LR: 0.000300 +12/07/2025 23:45:52 - INFO - __main__ - Step: 3920 Loss: 10.0826 LR: 0.000300 +12/07/2025 23:47:07 - INFO - __main__ - Step: 3930 Loss: 10.0945 LR: 0.000300 +12/07/2025 23:48:13 - INFO - __main__ - Step: 3940 Loss: 10.1269 LR: 0.000300 +12/07/2025 23:49:29 - INFO - __main__ - Step: 3950 Loss: 10.1226 LR: 0.000300 +12/07/2025 23:50:46 - INFO - __main__ - Step: 3960 Loss: 10.0802 LR: 0.000300 +12/07/2025 23:51:44 - INFO - __main__ - Step: 3970 Loss: 10.0990 LR: 0.000300 +12/07/2025 23:53:10 - INFO - __main__ - Step: 3980 Loss: 10.1175 LR: 0.000300 +12/07/2025 23:53:59 - INFO - __main__ - Step: 3990 Loss: 10.1342 LR: 0.000300 +12/07/2025 23:55:03 - INFO - __main__ - Step: 4000 Loss: 10.1227 LR: 0.000300 +12/07/2025 23:55:03 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-4000 +12/07/2025 23:55:15 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-4000/optimizer.bin +12/07/2025 23:55:15 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-4000/scheduler.bin +12/07/2025 23:55:15 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-4000/sampler.bin +12/07/2025 23:55:15 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-4000/random_states_0.pkl +12/07/2025 23:55:15 - INFO - __main__ - Saved state to output/checkpoint-4000 +12/07/2025 23:55:15 - INFO - __main__ - Generating videos for validation... +12/07/2025 23:55:15 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.96it/s] +12/07/2025 23:55:21 - INFO - __main__ - Validation videos saved to ./output +12/07/2025 23:56:28 - INFO - __main__ - Step: 4010 Loss: 10.1361 LR: 0.000300 +12/07/2025 23:57:36 - INFO - __main__ - Step: 4020 Loss: 10.1032 LR: 0.000300 +12/07/2025 23:59:02 - INFO - __main__ - Step: 4030 Loss: 10.1369 LR: 0.000300 +12/08/2025 00:00:26 - INFO - __main__ - Step: 4040 Loss: 10.0962 LR: 0.000300 +12/08/2025 00:01:57 - INFO - __main__ - Step: 4050 Loss: 10.1334 LR: 0.000300 +12/08/2025 00:03:49 - INFO - __main__ - Step: 4060 Loss: 10.1366 LR: 0.000300 +12/08/2025 00:04:16 - INFO - __main__ - Step: 4070 Loss: 10.1274 LR: 0.000300 +12/08/2025 00:05:22 - INFO - __main__ - Step: 4080 Loss: 10.0861 LR: 0.000300 +12/08/2025 00:07:11 - INFO - __main__ - Step: 4090 Loss: 10.1302 LR: 0.000300 +12/08/2025 00:08:18 - INFO - __main__ - Step: 4100 Loss: 10.1000 LR: 0.000300 +12/08/2025 00:09:56 - INFO - __main__ - Step: 4110 Loss: 10.0801 LR: 0.000300 +12/08/2025 00:10:43 - INFO - __main__ - Step: 4120 Loss: 10.1523 LR: 0.000300 +12/08/2025 00:11:58 - INFO - __main__ - Step: 4130 Loss: 10.1343 LR: 0.000300 +12/08/2025 00:13:08 - INFO - __main__ - Step: 4140 Loss: 10.1137 LR: 0.000300 +12/08/2025 00:15:03 - INFO - __main__ - Step: 4150 Loss: 10.1415 LR: 0.000300 +12/08/2025 00:16:01 - INFO - __main__ - Step: 4160 Loss: 10.1327 LR: 0.000300 +12/08/2025 00:16:50 - INFO - __main__ - Step: 4170 Loss: 10.1307 LR: 0.000300 +12/08/2025 00:17:56 - INFO - __main__ - Step: 4180 Loss: 10.0308 LR: 0.000300 +12/08/2025 00:19:13 - INFO - __main__ - Step: 4190 Loss: 10.1444 LR: 0.000300 +12/08/2025 00:20:30 - INFO - __main__ - Step: 4200 Loss: 10.1116 LR: 0.000300 +12/08/2025 00:21:53 - INFO - __main__ - Step: 4210 Loss: 10.0921 LR: 0.000300 +12/08/2025 00:22:40 - INFO - __main__ - Step: 4220 Loss: 10.1288 LR: 0.000300 +12/08/2025 00:24:00 - INFO - __main__ - Step: 4230 Loss: 10.0916 LR: 0.000300 +12/08/2025 00:25:00 - INFO - __main__ - Step: 4240 Loss: 10.1101 LR: 0.000300 +12/08/2025 00:26:43 - INFO - __main__ - Step: 4250 Loss: 10.1124 LR: 0.000300 +12/08/2025 00:28:05 - INFO - __main__ - Step: 4260 Loss: 10.1369 LR: 0.000300 +12/08/2025 00:29:16 - INFO - __main__ - Step: 4270 Loss: 10.1046 LR: 0.000300 +12/08/2025 00:29:47 - INFO - __main__ - Step: 4280 Loss: 10.0824 LR: 0.000300 +12/08/2025 00:31:17 - INFO - __main__ - Step: 4290 Loss: 10.1424 LR: 0.000300 +12/08/2025 00:32:50 - INFO - __main__ - Step: 4300 Loss: 10.1022 LR: 0.000300 +12/08/2025 00:33:31 - INFO - __main__ - Step: 4310 Loss: 10.1400 LR: 0.000300 +12/08/2025 00:35:06 - INFO - __main__ - Step: 4320 Loss: 10.1042 LR: 0.000300 +12/08/2025 00:36:03 - INFO - __main__ - Step: 4330 Loss: 10.1176 LR: 0.000300 +12/08/2025 00:37:19 - INFO - __main__ - Step: 4340 Loss: 10.1133 LR: 0.000300 +12/08/2025 00:38:10 - INFO - __main__ - Step: 4350 Loss: 10.1367 LR: 0.000300 +12/08/2025 00:39:35 - INFO - __main__ - Step: 4360 Loss: 10.1220 LR: 0.000300 +12/08/2025 00:41:13 - INFO - __main__ - Step: 4370 Loss: 10.1514 LR: 0.000300 +12/08/2025 00:42:38 - INFO - __main__ - Step: 4380 Loss: 10.0807 LR: 0.000300 +12/08/2025 00:43:00 - INFO - __main__ - Step: 4390 Loss: 10.1551 LR: 0.000300 +12/08/2025 00:43:56 - INFO - __main__ - Step: 4400 Loss: 10.1531 LR: 0.000300 +12/08/2025 00:45:37 - INFO - __main__ - Step: 4410 Loss: 10.1076 LR: 0.000300 +12/08/2025 00:47:04 - INFO - __main__ - Step: 
4420 Loss: 10.1118 LR: 0.000300 +12/08/2025 00:47:59 - INFO - __main__ - Step: 4430 Loss: 10.1432 LR: 0.000300 +12/08/2025 00:49:25 - INFO - __main__ - Step: 4440 Loss: 10.1053 LR: 0.000300 +12/08/2025 00:50:33 - INFO - __main__ - Step: 4450 Loss: 10.0977 LR: 0.000300 +12/08/2025 00:51:30 - INFO - __main__ - Step: 4460 Loss: 10.1340 LR: 0.000300 +12/08/2025 00:52:47 - INFO - __main__ - Step: 4470 Loss: 10.0934 LR: 0.000300 +12/08/2025 00:54:08 - INFO - __main__ - Step: 4480 Loss: 10.1376 LR: 0.000300 +12/08/2025 00:55:23 - INFO - __main__ - Step: 4490 Loss: 10.1677 LR: 0.000300 +12/08/2025 00:56:29 - INFO - __main__ - Step: 4500 Loss: 10.1033 LR: 0.000300 +12/08/2025 00:56:29 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-4500 +12/08/2025 00:56:40 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-4500/optimizer.bin +12/08/2025 00:56:40 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-4500/scheduler.bin +12/08/2025 00:56:40 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-4500/sampler.bin +12/08/2025 00:56:40 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-4500/random_states_0.pkl +12/08/2025 00:56:40 - INFO - __main__ - Saved state to output/checkpoint-4500 +12/08/2025 00:56:40 - INFO - __main__ - Generating videos for validation... +12/08/2025 00:56:40 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.78it/s] +12/08/2025 00:56:47 - INFO - __main__ - Validation videos saved to ./output +12/08/2025 00:57:25 - INFO - __main__ - Step: 4510 Loss: 10.1278 LR: 0.000300 +12/08/2025 00:58:53 - INFO - __main__ - Step: 4520 Loss: 10.1052 LR: 0.000300 +12/08/2025 01:00:04 - INFO - __main__ - Step: 4530 Loss: 10.1008 LR: 0.000300 +12/08/2025 01:01:24 - INFO - __main__ - Step: 4540 Loss: 10.1600 LR: 0.000300 +12/08/2025 01:02:53 - INFO - __main__ - Step: 4550 Loss: 10.0973 LR: 0.000300 +12/08/2025 01:04:03 - INFO - __main__ - Step: 4560 Loss: 10.1170 LR: 0.000300 +12/08/2025 01:05:14 - INFO - __main__ - Step: 4570 Loss: 10.1477 LR: 0.000300 +12/08/2025 01:06:49 - INFO - __main__ - Step: 4580 Loss: 10.0890 LR: 0.000300 +12/08/2025 01:08:00 - INFO - __main__ - Step: 4590 Loss: 10.1519 LR: 0.000300 +12/08/2025 01:08:36 - INFO - __main__ - Step: 4600 Loss: 10.1119 LR: 0.000300 +12/08/2025 01:10:17 - INFO - __main__ - Step: 4610 Loss: 10.0597 LR: 0.000300 +12/08/2025 01:11:35 - INFO - __main__ - Step: 4620 Loss: 10.1061 LR: 0.000300 +12/08/2025 01:12:48 - INFO - __main__ - Step: 4630 Loss: 10.0830 LR: 0.000300 +12/08/2025 01:14:02 - INFO - __main__ - Step: 4640 Loss: 10.0708 LR: 0.000300 +12/08/2025 01:15:36 - INFO - __main__ - Step: 4650 Loss: 10.1501 LR: 0.000300 +12/08/2025 01:16:28 - INFO - __main__ - Step: 4660 Loss: 10.1455 LR: 0.000300 +12/08/2025 01:17:53 - INFO - __main__ - Step: 4670 Loss: 10.1160 LR: 0.000300 +12/08/2025 01:18:59 - INFO - __main__ - Step: 4680 Loss: 10.1091 LR: 0.000300 +12/08/2025 01:20:16 - INFO - __main__ - Step: 4690 Loss: 10.1124 LR: 0.000300 +12/08/2025 01:21:19 - INFO - __main__ - Step: 4700 Loss: 10.1044 LR: 0.000300 +12/08/2025 01:22:56 - INFO - __main__ - Step: 4710 Loss: 10.1138 LR: 0.000300 +12/08/2025 01:23:55 - INFO - __main__ - Step: 4720 Loss: 10.1304 LR: 0.000300 +12/08/2025 01:25:14 - INFO - __main__ - Step: 4730 Loss: 
10.1106 LR: 0.000300 +12/08/2025 01:26:22 - INFO - __main__ - Step: 4740 Loss: 10.1246 LR: 0.000300 +12/08/2025 01:27:49 - INFO - __main__ - Step: 4750 Loss: 10.0716 LR: 0.000300 +12/08/2025 01:29:05 - INFO - __main__ - Step: 4760 Loss: 10.0866 LR: 0.000300 +12/08/2025 01:30:01 - INFO - __main__ - Step: 4770 Loss: 10.1408 LR: 0.000300 +12/08/2025 01:31:14 - INFO - __main__ - Step: 4780 Loss: 10.1482 LR: 0.000300 +12/08/2025 01:32:14 - INFO - __main__ - Step: 4790 Loss: 10.1269 LR: 0.000300 +12/08/2025 01:33:39 - INFO - __main__ - Step: 4800 Loss: 10.1337 LR: 0.000300 +12/08/2025 01:35:15 - INFO - __main__ - Step: 4810 Loss: 10.0809 LR: 0.000300 +12/08/2025 01:35:58 - INFO - __main__ - Step: 4820 Loss: 10.1140 LR: 0.000300 +12/08/2025 01:37:18 - INFO - __main__ - Step: 4830 Loss: 10.1142 LR: 0.000300 +12/08/2025 01:38:02 - INFO - __main__ - Step: 4840 Loss: 10.0793 LR: 0.000300 +12/08/2025 01:40:05 - INFO - __main__ - Step: 4850 Loss: 10.1118 LR: 0.000300 +12/08/2025 01:40:52 - INFO - __main__ - Step: 4860 Loss: 10.0945 LR: 0.000300 +12/08/2025 01:41:48 - INFO - __main__ - Step: 4870 Loss: 10.1303 LR: 0.000300 +12/08/2025 01:42:58 - INFO - __main__ - Step: 4880 Loss: 10.1377 LR: 0.000300 +12/08/2025 01:44:26 - INFO - __main__ - Step: 4890 Loss: 10.1220 LR: 0.000300 +12/08/2025 01:45:24 - INFO - __main__ - Step: 4900 Loss: 10.1149 LR: 0.000300 +12/08/2025 01:46:25 - INFO - __main__ - Step: 4910 Loss: 10.1267 LR: 0.000300 +12/08/2025 01:47:48 - INFO - __main__ - Step: 4920 Loss: 10.0730 LR: 0.000300 +12/08/2025 01:48:53 - INFO - __main__ - Step: 4930 Loss: 10.1204 LR: 0.000300 +12/08/2025 01:49:55 - INFO - __main__ - Step: 4940 Loss: 10.1248 LR: 0.000300 +12/08/2025 01:51:09 - INFO - __main__ - Step: 4950 Loss: 10.0880 LR: 0.000300 +12/08/2025 01:52:23 - INFO - __main__ - Step: 4960 Loss: 10.0721 LR: 0.000300 +12/08/2025 01:54:26 - INFO - __main__ - Step: 4970 Loss: 10.0746 LR: 0.000300 +12/08/2025 01:55:29 - INFO - __main__ - Step: 4980 Loss: 10.0852 LR: 0.000300 +12/08/2025 01:56:24 - INFO - __main__ - Step: 4990 Loss: 10.0700 LR: 0.000300 +12/08/2025 01:57:29 - INFO - __main__ - Step: 5000 Loss: 10.1090 LR: 0.000300 +12/08/2025 01:57:29 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-5000 +12/08/2025 01:57:41 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-5000/optimizer.bin +12/08/2025 01:57:41 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-5000/scheduler.bin +12/08/2025 01:57:41 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-5000/sampler.bin +12/08/2025 01:57:41 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-5000/random_states_0.pkl +12/08/2025 01:57:41 - INFO - __main__ - Saved state to output/checkpoint-5000 +12/08/2025 01:57:41 - INFO - __main__ - Generating videos for validation... +12/08/2025 01:57:41 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.06it/s] +12/08/2025 01:57:48 - INFO - __main__ - Validation videos saved to ./output +12/08/2025 01:58:26 - INFO - __main__ - Step: 5010 Loss: 10.1705 LR: 0.000300 +12/08/2025 02:00:24 - INFO - __main__ - Step: 5020 Loss: 10.1525 LR: 0.000300 +12/08/2025 02:01:16 - INFO - __main__ - Step: 5030 Loss: 10.1116 LR: 0.000300 +12/08/2025 02:02:48 - INFO - __main__ - Step: 5040 Loss: 10.0961 LR: 0.000300 +12/08/2025 02:03:39 - INFO - __main__ - Step: 5050 Loss: 10.1022 LR: 0.000300 +12/08/2025 02:05:03 - INFO - __main__ - Step: 5060 Loss: 10.1888 LR: 0.000300 +12/08/2025 02:06:14 - INFO - __main__ - Step: 5070 Loss: 10.0822 LR: 0.000300 +12/08/2025 02:07:40 - INFO - __main__ - Step: 5080 Loss: 10.1024 LR: 0.000300 +12/08/2025 02:08:44 - INFO - __main__ - Step: 5090 Loss: 10.1402 LR: 0.000300 +12/08/2025 02:10:18 - INFO - __main__ - Step: 5100 Loss: 10.0779 LR: 0.000300 +12/08/2025 02:11:30 - INFO - __main__ - Step: 5110 Loss: 10.1147 LR: 0.000300 +12/08/2025 02:12:38 - INFO - __main__ - Step: 5120 Loss: 10.1174 LR: 0.000300 +12/08/2025 02:14:18 - INFO - __main__ - Step: 5130 Loss: 10.0944 LR: 0.000300 +12/08/2025 02:15:14 - INFO - __main__ - Step: 5140 Loss: 10.1069 LR: 0.000300 +12/08/2025 02:16:20 - INFO - __main__ - Step: 5150 Loss: 10.0671 LR: 0.000300 +12/08/2025 02:17:28 - INFO - __main__ - Step: 5160 Loss: 10.1239 LR: 0.000300 +12/08/2025 02:18:43 - INFO - __main__ - Step: 5170 Loss: 10.0908 LR: 0.000300 +12/08/2025 02:19:58 - INFO - __main__ - Step: 5180 Loss: 10.0803 LR: 0.000300 +12/08/2025 02:21:01 - INFO - __main__ - Step: 5190 Loss: 10.1338 LR: 0.000300 +12/08/2025 02:22:19 - INFO - __main__ - Step: 5200 Loss: 10.1098 LR: 0.000300 +12/08/2025 02:23:44 - INFO - __main__ - Step: 5210 Loss: 10.1081 LR: 0.000300 +12/08/2025 02:24:32 - INFO - __main__ - Step: 5220 Loss: 10.0951 LR: 0.000300 +12/08/2025 02:25:51 - INFO - __main__ - Step: 5230 Loss: 10.1255 LR: 0.000300 +12/08/2025 02:27:18 - INFO - __main__ - Step: 5240 Loss: 10.1175 LR: 0.000300 +12/08/2025 02:28:08 - INFO - __main__ - Step: 5250 Loss: 10.1453 LR: 0.000300 +12/08/2025 02:30:13 - INFO - __main__ - Step: 5260 Loss: 10.1351 LR: 0.000300 +12/08/2025 02:30:44 - INFO - __main__ - Step: 5270 Loss: 10.1315 LR: 0.000300 +12/08/2025 02:31:54 - INFO - __main__ - Step: 5280 Loss: 10.1318 LR: 0.000300 +12/08/2025 02:33:12 - INFO - __main__ - Step: 5290 Loss: 10.1050 LR: 0.000300 +12/08/2025 02:34:28 - INFO - __main__ - Step: 5300 Loss: 10.0857 LR: 0.000300 +12/08/2025 02:36:01 - INFO - __main__ - Step: 5310 Loss: 10.1386 LR: 0.000300 +12/08/2025 02:36:44 - INFO - __main__ - Step: 5320 Loss: 10.1030 LR: 0.000300 +12/08/2025 02:37:50 - INFO - __main__ - Step: 5330 Loss: 10.1099 LR: 0.000300 +12/08/2025 02:39:21 - INFO - __main__ - Step: 5340 Loss: 10.0954 LR: 0.000300 +12/08/2025 02:40:51 - INFO - __main__ - Step: 5350 Loss: 10.1111 LR: 0.000300 +12/08/2025 02:41:44 - INFO - __main__ - Step: 5360 Loss: 10.1610 LR: 0.000300 +12/08/2025 02:43:07 - INFO - __main__ - Step: 5370 Loss: 10.1328 LR: 0.000300 +12/08/2025 02:44:41 - INFO - __main__ - Step: 5380 Loss: 10.1277 LR: 0.000300 +12/08/2025 02:45:39 - INFO - __main__ - Step: 5390 Loss: 10.1130 LR: 0.000300 +12/08/2025 02:47:05 - INFO - __main__ - Step: 5400 Loss: 10.1247 LR: 0.000300 +12/08/2025 02:48:01 - INFO - __main__ - Step: 5410 Loss: 10.1113 LR: 0.000300 +12/08/2025 02:49:16 - INFO - __main__ - Step: 
5420 Loss: 10.1191 LR: 0.000300 +12/08/2025 02:50:25 - INFO - __main__ - Step: 5430 Loss: 10.0968 LR: 0.000300 +12/08/2025 02:51:40 - INFO - __main__ - Step: 5440 Loss: 10.1045 LR: 0.000300 +12/08/2025 02:52:39 - INFO - __main__ - Step: 5450 Loss: 10.1583 LR: 0.000300 +12/08/2025 02:54:21 - INFO - __main__ - Step: 5460 Loss: 10.1261 LR: 0.000300 +12/08/2025 02:55:15 - INFO - __main__ - Step: 5470 Loss: 10.1430 LR: 0.000300 +12/08/2025 02:56:36 - INFO - __main__ - Step: 5480 Loss: 10.1367 LR: 0.000300 +12/08/2025 02:57:19 - INFO - __main__ - Step: 5490 Loss: 10.1244 LR: 0.000300 +12/08/2025 02:59:10 - INFO - __main__ - Step: 5500 Loss: 10.0980 LR: 0.000300 +12/08/2025 02:59:10 - INFO - accelerate.accelerator - Saving current state to output/checkpoint-5500 +12/08/2025 02:59:19 - INFO - accelerate.checkpointing - Optimizer state saved in output/checkpoint-5500/optimizer.bin +12/08/2025 02:59:19 - INFO - accelerate.checkpointing - Scheduler state saved in output/checkpoint-5500/scheduler.bin +12/08/2025 02:59:19 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output/checkpoint-5500/sampler.bin +12/08/2025 02:59:19 - INFO - accelerate.checkpointing - Random states saved in output/checkpoint-5500/random_states_0.pkl +12/08/2025 02:59:19 - INFO - __main__ - Saved state to output/checkpoint-5500 +12/08/2025 02:59:19 - INFO - __main__ - Generating videos for validation... +12/08/2025 02:59:19 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.24it/s] +12/08/2025 02:59:25 - INFO - __main__ - Validation videos saved to ./output +12/08/2025 02:59:47 - INFO - __main__ - Step: 5510 Loss: 10.1332 LR: 0.000300 +12/08/2025 03:00:58 - INFO - __main__ - Step: 5520 Loss: 10.1184 LR: 0.000300 +12/08/2025 03:02:43 - INFO - __main__ - Step: 5530 Loss: 10.1325 LR: 0.000300 +12/08/2025 03:03:45 - INFO - __main__ - Step: 5540 Loss: 10.1413 LR: 0.000300 +12/08/2025 03:04:45 - INFO - __main__ - Step: 5550 Loss: 10.1006 LR: 0.000300 +12/08/2025 03:06:39 - INFO - __main__ - Step: 5560 Loss: 10.0909 LR: 0.000300 +12/08/2025 03:07:59 - INFO - __main__ - Step: 5570 Loss: 10.0817 LR: 0.000300 +12/08/2025 03:08:27 - INFO - __main__ - Step: 5580 Loss: 10.0932 LR: 0.000300 +12/08/2025 03:09:54 - INFO - __main__ - Step: 5590 Loss: 10.1127 LR: 0.000300 +12/08/2025 03:11:19 - INFO - __main__ - Step: 5600 Loss: 10.1366 LR: 0.000300 +12/08/2025 03:12:40 - INFO - __main__ - Step: 5610 Loss: 10.0984 LR: 0.000300 diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/requirements.txt b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 
+itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/files/wandb-metadata.json b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4fcae98150948629a6f45cdf90b56d19024c2fb3 --- /dev/null +++ b/Meissonic/wandb/run-20251207_162442-54o4hegd/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-07T16:24:42.825457Z", + "args": [ + "--text_encoder_architecture", + "umt5-base", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "8", + "--video_height", + "64", + "--video_width", + "112", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "16", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output", + 
"--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11761633955840" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "9xy59bw101r8mxxys00zg7hx7jzj0cxj" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug-core.log b/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..6f2ff7dd6a0685183aafa84a81cbf7b1c20306a1 --- /dev/null +++ b/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-07T16:24:42.895970668Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpnpejymgr/port-1347178.txt","pid":1347178,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-07T16:24:42.897409679Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1347178} +{"time":"2025-12-07T16:24:42.897387421Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1347178-1347434-4013613872/socket","Net":"unix"}} +{"time":"2025-12-07T16:24:43.081112475Z","level":"INFO","msg":"connection: ManageConnectionData: new connection 
created","id":"1(@)"} +{"time":"2025-12-07T16:24:43.087235082Z","level":"INFO","msg":"handleInformInit: received","streamId":"54o4hegd","id":"1(@)"} +{"time":"2025-12-07T16:24:43.258753348Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"54o4hegd","id":"1(@)"} +{"time":"2025-12-08T03:13:44.147670361Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug-internal.log b/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..aaa4207e262b4887b1bbbfa32a036e6cc84b6d1a --- /dev/null +++ b/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-12-07T16:24:43.087348347Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-07T16:24:43.257553408Z","level":"INFO","msg":"stream: created new stream","id":"54o4hegd"} +{"time":"2025-12-07T16:24:43.257656368Z","level":"INFO","msg":"handler: started","stream_id":"54o4hegd"} +{"time":"2025-12-07T16:24:43.258477453Z","level":"INFO","msg":"sender: started","stream_id":"54o4hegd"} +{"time":"2025-12-07T16:24:43.258440628Z","level":"INFO","msg":"stream: started","id":"54o4hegd"} +{"time":"2025-12-07T16:24:43.25845881Z","level":"INFO","msg":"writer: started","stream_id":"54o4hegd"} +{"time":"2025-12-07T17:12:28.82915883Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/jin-bin/meissonic/54o4hegd/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error. The server encountered a temporary error and could not complete your request. Please try again in 30 seconds.

\n\n"} diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug.log b/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8040adbfb5270528b2bbf220b0ec72df8d564d3e --- /dev/null +++ b/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_setup.py:_flush():80] Configure stats pid to 1347178 +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug.log +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251207_162442-54o4hegd/logs/debug-internal.log +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_init.py:init():841] calling init triggers +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-07 16:24:42,828 INFO MainThread:1347178 [wandb_init.py:init():889] starting backend +2025-12-07 16:24:43,081 INFO MainThread:1347178 [wandb_init.py:init():892] sending inform_init request +2025-12-07 16:24:43,085 INFO MainThread:1347178 [wandb_init.py:init():900] backend started and connected +2025-12-07 16:24:43,087 INFO MainThread:1347178 [wandb_init.py:init():970] updated telemetry +2025-12-07 16:24:43,091 INFO MainThread:1347178 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-07 16:24:43,521 INFO MainThread:1347178 [wandb_init.py:init():1041] starting run threads in backend +2025-12-07 16:24:43,630 INFO MainThread:1347178 [wandb_run.py:_console_start():2521] atexit reg +2025-12-07 16:24:43,630 INFO MainThread:1347178 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-07 16:24:43,630 INFO MainThread:1347178 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-07 16:24:43,630 INFO MainThread:1347178 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-07 16:24:43,633 INFO MainThread:1347178 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-07 16:24:43,634 INFO MainThread:1347178 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-base', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 16, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 8, 'video_height': 64, 'video_width': 112, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251207_162442-54o4hegd/run-54o4hegd.wandb b/Meissonic/wandb/run-20251207_162442-54o4hegd/run-54o4hegd.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a9e9dd08f569f54d894a3397dc044c9a0471ca43 --- /dev/null +++ b/Meissonic/wandb/run-20251207_162442-54o4hegd/run-54o4hegd.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce67eacd158fd96dbf7ba1b3fd79a7fb83ceb723f95f1b23c5bcd0e0c2b31a3d +size 9732096 diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/config.yaml b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c232b95c5476b6772c165af7150a97749ce793ff --- /dev/null +++ b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/config.yaml @@ -0,0 +1,288 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 4vc7e5b6c9w1c2lhq8r0tju8hogo2vvk: + args: + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "8" + - --video_height + - "64" + - --video_width + - "112" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "16" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - 
"10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_64x112_8f_16bs + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11813091573760" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-08T03:17:49.487702Z" + writerId: 4vc7e5b6c9w1c2lhq8r0tju8hogo2vvk + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + 
value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 8 +output_dir: + value: ./output_64x112_8f_16bs +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 16 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 64 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 112 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/output.log b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d3cf105e64766791897201ff8fdfb3ce18cd132d --- /dev/null +++ b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/output.log @@ -0,0 +1,12 @@ +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1350, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 459, in main + raise ValueError(f"For video training, text_encoder_architecture must be 'umt5-base' or 't5', got '{args.text_encoder_architecture}'") +ValueError: For video training, text_encoder_architecture must be 'umt5-base' or 't5', got 'umt5-xxl' +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1350, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 459, in main +[rank0]: raise ValueError(f"For video training, text_encoder_architecture must be 'umt5-base' or 't5', got '{args.text_encoder_architecture}'") +[rank0]: ValueError: For video training, text_encoder_architecture must be 'umt5-base' or 't5', got 'umt5-xxl' diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/requirements.txt b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 
+pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..62bae269aa82a40cac99c983da40ce917b2086d0 --- /dev/null +++ b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T03:17:49.487702Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "8", + "--video_height", + "64", + "--video_width", + "112", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "16", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_64x112_8f_16bs", + 
"--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11813091573760" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "4vc7e5b6c9w1c2lhq8r0tju8hogo2vvk" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/wandb-summary.json b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a620d0c1047a4dd8a400939b6da246ed8063a7 --- /dev/null +++ b/Meissonic/wandb/run-20251208_031749-b01lmzcy/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0},"_runtime":0} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug-core.log b/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..cd6c468851bfc0657488be0e325a6d4d13db691b --- /dev/null +++ b/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-08T03:17:49.837623537Z","level":"INFO","msg":"main: starting 
server","port-filename":"/opt/dlami/nvme/tmp_user/tmpn5to09j1/port-3352340.txt","pid":3352340,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T03:17:49.838956393Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3352340} +{"time":"2025-12-08T03:17:49.839564548Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3352340-3352619-1046294805/socket","Net":"unix"}} +{"time":"2025-12-08T03:17:50.019531422Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T03:17:50.032275187Z","level":"INFO","msg":"handleInformInit: received","streamId":"b01lmzcy","id":"1(@)"} +{"time":"2025-12-08T03:17:50.208038227Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"b01lmzcy","id":"1(@)"} +{"time":"2025-12-08T03:17:50.568307111Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-08T03:17:50.568347689Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-08T03:17:50.56834814Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-08T03:17:50.568413111Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3352340-3352619-1046294805/socket","Net":"unix"}} +{"time":"2025-12-08T03:17:50.568432059Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-08T03:17:51.117622137Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-08T03:17:51.117639621Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-08T03:17:51.117648381Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug-internal.log b/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..af1f6c46c08a2d3b327eddbbf062fee8badbce1f --- /dev/null +++ b/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-08T03:17:50.032379945Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T03:17:50.207002094Z","level":"INFO","msg":"stream: created new stream","id":"b01lmzcy"} +{"time":"2025-12-08T03:17:50.208028973Z","level":"INFO","msg":"stream: started","id":"b01lmzcy"} +{"time":"2025-12-08T03:17:50.208052916Z","level":"INFO","msg":"writer: started","stream_id":"b01lmzcy"} +{"time":"2025-12-08T03:17:50.208514733Z","level":"INFO","msg":"handler: started","stream_id":"b01lmzcy"} +{"time":"2025-12-08T03:17:50.208703909Z","level":"INFO","msg":"sender: started","stream_id":"b01lmzcy"} +{"time":"2025-12-08T03:17:50.568373135Z","level":"INFO","msg":"stream: closing","id":"b01lmzcy"} +{"time":"2025-12-08T03:17:50.986975935Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-08T03:17:51.103719495Z","level":"INFO","msg":"handler: closed","stream_id":"b01lmzcy"} +{"time":"2025-12-08T03:17:51.103789916Z","level":"INFO","msg":"sender: closed","stream_id":"b01lmzcy"} +{"time":"2025-12-08T03:17:51.103797706Z","level":"INFO","msg":"stream: closed","id":"b01lmzcy"} diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug.log b/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug.log new file mode 100644 index 
0000000000000000000000000000000000000000..3f76c45d429dff618018dc6a9881ec8fad9e71d1 --- /dev/null +++ b/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_setup.py:_flush():80] Configure stats pid to 3352340 +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug.log +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_031749-b01lmzcy/logs/debug-internal.log +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_init.py:init():841] calling init triggers +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 03:17:49,490 INFO MainThread:3352340 [wandb_init.py:init():889] starting backend +2025-12-08 03:17:50,019 INFO MainThread:3352340 [wandb_init.py:init():892] sending inform_init request +2025-12-08 03:17:50,026 INFO MainThread:3352340 [wandb_init.py:init():900] backend started and connected +2025-12-08 03:17:50,029 INFO MainThread:3352340 [wandb_init.py:init():970] updated telemetry +2025-12-08 03:17:50,034 INFO MainThread:3352340 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 03:17:50,441 INFO MainThread:3352340 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 03:17:50,556 INFO MainThread:3352340 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 03:17:50,556 INFO MainThread:3352340 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 03:17:50,556 INFO MainThread:3352340 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 03:17:50,556 INFO MainThread:3352340 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-08 03:17:50,561 INFO MainThread:3352340 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 03:17:50,562 INFO MainThread:3352340 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_64x112_8f_16bs', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 16, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 8, 'video_height': 64, 'video_width': 112, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-08 03:17:50,568 INFO wandb-AsyncioManager-main:3352340 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-08 03:17:50,568 INFO wandb-AsyncioManager-main:3352340 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251208_031749-b01lmzcy/run-b01lmzcy.wandb b/Meissonic/wandb/run-20251208_031749-b01lmzcy/run-b01lmzcy.wandb new file mode 100644 index 0000000000000000000000000000000000000000..d17247516273d99bca5236bc89ab45ec1711ee08 Binary files /dev/null and b/Meissonic/wandb/run-20251208_031749-b01lmzcy/run-b01lmzcy.wandb differ diff --git a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/config.yaml b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7de70d2aebda0c0075e2ec276373c62293037fbd --- /dev/null +++ b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/config.yaml @@ -0,0 +1,288 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + scyk41pdbji4tjive4amqhjyjga3x3fn: + args: + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "8" + - --video_height + - "64" + - --video_width + - "112" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "16" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_64x112_8f_16bs + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11813091741696" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + 
name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-08T03:20:21.949323Z" + writerId: scyk41pdbji4tjive4amqhjyjga3x3fn + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 8 +output_dir: + value: ./output_64x112_8f_16bs +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 16 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 64 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 112 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/output.log b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..448748f1ff63cfb7139b46a593876fccff8c3d6f --- /dev/null +++ b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/output.log @@ -0,0 +1,47 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 70.28it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4988.98it/s] +12/08/2025 03:20:28 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=4, W'=7 +12/08/2025 03:20:28 - INFO - __main__ - Theoretical dimensions: F'=1, H'=4, W'=7 +12/08/2025 03:20:28 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 03:20:31 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 03:20:47 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 03:20:47 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1363, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 727, in main + missing_keys, unexpected_keys = model.backbone.load_state_dict(wan_state_dict, strict=False) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2609, in load_state_dict + load(self, state_dict) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2597, in load + load(child, child_state_dict, child_prefix) # noqa: F821 + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2597, in load + load(child, child_state_dict, child_prefix) # noqa: F821 + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2597, in load + load(child, child_state_dict, child_prefix) # noqa: F821 + [Previous line repeated 1 more time] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2580, in load + module._load_from_state_dict( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2487, in _load_from_state_dict + param.copy_(input_param) +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1363, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 727, in main +[rank0]: missing_keys, unexpected_keys = model.backbone.load_state_dict(wan_state_dict, strict=False) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2609, in load_state_dict +[rank0]: load(self, state_dict) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2597, in load +[rank0]: load(child, child_state_dict, child_prefix) # noqa: F821 +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2597, in load +[rank0]: load(child, child_state_dict, child_prefix) # noqa: F821 +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2597, in load +[rank0]: load(child, child_state_dict, child_prefix) # noqa: F821 +[rank0]: [Previous line repeated 1 more time] +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2580, in load +[rank0]: module._load_from_state_dict( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2487, in _load_from_state_dict +[rank0]: param.copy_(input_param) +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/requirements.txt b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git 
a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1b14e19694c5a80362d3f96aeacd62177b111b --- /dev/null +++ b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T03:20:21.949323Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "8", + "--video_height", + "64", + "--video_width", + "112", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "16", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_64x112_8f_16bs", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11813091741696" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + 
"architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "scyk41pdbji4tjive4amqhjyjga3x3fn" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/wandb-summary.json b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4dda75148b11087b7ca4d383b6747633b42a57 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":37},"_runtime":37} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug-core.log b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..469470d366a736000f6ce838dbc6e686d3f9a069 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-08T03:20:22.018532764Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmprej4fezv/port-3354358.txt","pid":3354358,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T03:20:22.019109108Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3354358} +{"time":"2025-12-08T03:20:22.01910427Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3354358-3354621-2754597729/socket","Net":"unix"}} +{"time":"2025-12-08T03:20:22.203005927Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T03:20:22.208970474Z","level":"INFO","msg":"handleInformInit: received","streamId":"kq3w5uiu","id":"1(@)"} +{"time":"2025-12-08T03:20:22.374155661Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"kq3w5uiu","id":"1(@)"} +{"time":"2025-12-08T03:21:00.090286328Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-08T03:21:00.090341092Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-08T03:21:00.090330468Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-08T03:21:00.090427144Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3354358-3354621-2754597729/socket","Net":"unix"}} +{"time":"2025-12-08T03:21:00.09046078Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-08T03:21:00.489451665Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-08T03:21:00.489471453Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-08T03:21:00.489481117Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug-internal.log b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..723993b298ae26734c8c5dfa46d09b1f5911367b --- /dev/null +++ b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-08T03:20:22.209111428Z","level":"INFO","msg":"stream: starting","core 
version":"0.23.1"} +{"time":"2025-12-08T03:20:22.373964604Z","level":"INFO","msg":"stream: created new stream","id":"kq3w5uiu"} +{"time":"2025-12-08T03:20:22.374049571Z","level":"INFO","msg":"handler: started","stream_id":"kq3w5uiu"} +{"time":"2025-12-08T03:20:22.374147086Z","level":"INFO","msg":"stream: started","id":"kq3w5uiu"} +{"time":"2025-12-08T03:20:22.374162519Z","level":"INFO","msg":"writer: started","stream_id":"kq3w5uiu"} +{"time":"2025-12-08T03:20:22.374162263Z","level":"INFO","msg":"sender: started","stream_id":"kq3w5uiu"} +{"time":"2025-12-08T03:21:00.090348177Z","level":"INFO","msg":"stream: closing","id":"kq3w5uiu"} +{"time":"2025-12-08T03:21:00.366237455Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-08T03:21:00.48656867Z","level":"INFO","msg":"handler: closed","stream_id":"kq3w5uiu"} +{"time":"2025-12-08T03:21:00.486659461Z","level":"INFO","msg":"sender: closed","stream_id":"kq3w5uiu"} +{"time":"2025-12-08T03:21:00.486667719Z","level":"INFO","msg":"stream: closed","id":"kq3w5uiu"} diff --git a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug.log b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..c647f07915d9a9c5f822690eee2e20e8fdb8f287 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_setup.py:_flush():80] Configure stats pid to 3354358 +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug.log +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_032021-kq3w5uiu/logs/debug-internal.log +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_init.py:init():841] calling init triggers +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 03:20:21,952 INFO MainThread:3354358 [wandb_init.py:init():889] starting backend +2025-12-08 03:20:22,203 INFO MainThread:3354358 [wandb_init.py:init():892] sending inform_init request +2025-12-08 03:20:22,207 INFO MainThread:3354358 [wandb_init.py:init():900] backend started and connected +2025-12-08 03:20:22,208 INFO MainThread:3354358 [wandb_init.py:init():970] updated telemetry +2025-12-08 03:20:22,212 INFO MainThread:3354358 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 03:20:22,747 INFO MainThread:3354358 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 03:20:22,855 INFO MainThread:3354358 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 03:20:22,855 INFO MainThread:3354358 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 03:20:22,855 INFO MainThread:3354358 [wandb_run.py:_redirect():2438] Wrapping output 
streams. +2025-12-08 03:20:22,856 INFO MainThread:3354358 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-08 03:20:22,858 INFO MainThread:3354358 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 03:20:22,859 INFO MainThread:3354358 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_64x112_8f_16bs', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 16, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 8, 'video_height': 64, 'video_width': 112, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-08 03:21:00,090 INFO wandb-AsyncioManager-main:3354358 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-08 03:21:00,090 INFO wandb-AsyncioManager-main:3354358 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
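Aside (editor's note, not part of the committed files): the output.log diffs above and below report "Actual compressed dimensions from tokenizer: F'=2" against "Theoretical dimensions: F'=1" for both the 8-frame and 9-frame runs with the Cosmos-1.0-Tokenizer-DV8x16x16 tokenizer at 64x112. A plausible reading, assuming the tokenizer follows the usual causal convention in which the first frame gets its own latent frame and each further block of 8 frames adds one more, is sketched below; the formula and the helper names are assumptions for illustration, not something confirmed by the training script.

```python
import math

def causal_latent_frames(num_frames: int, temporal_factor: int = 8) -> int:
    # Assumed causal convention: first frame alone, then one latent frame per
    # additional block of `temporal_factor` frames.
    return 1 + math.ceil((num_frames - 1) / temporal_factor)

def naive_latent_frames(num_frames: int, temporal_factor: int = 8) -> int:
    # What the script appears to log as the "theoretical" value.
    return num_frames // temporal_factor

for t in (8, 9):
    print(t, naive_latent_frames(t), causal_latent_frames(t))
# 8 -> theoretical 1, actual 2   (matches the 8-frame run's log)
# 9 -> theoretical 1, actual 2   (matches the 9-frame run's log)

# Spatial grid with 16x16 compression: 64 // 16 = 4 and 112 // 16 = 7, i.e. H'=4, W'=7.
```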
diff --git a/Meissonic/wandb/run-20251208_032021-kq3w5uiu/run-kq3w5uiu.wandb b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/run-kq3w5uiu.wandb new file mode 100644 index 0000000000000000000000000000000000000000..4fac154706520195a8d0616cdfd6958d9c04cf19 Binary files /dev/null and b/Meissonic/wandb/run-20251208_032021-kq3w5uiu/run-kq3w5uiu.wandb differ diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/config.yaml b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4c4ef26c685d02aa0488f7e22179cb720133c48 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/config.yaml @@ -0,0 +1,288 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + xtt0igej7usiniueah5c5vplpej2u7p3: + args: + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "9" + - --video_height + - "64" + - --video_width + - "112" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-1.0-Tokenizer-DV8x16x16 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "16" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_64x112_8f_16bs + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11813091844096" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + 
name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-08T03:21:17.884575Z" + writerId: xtt0igej7usiniueah5c5vplpej2u7p3 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 9 +output_dir: + value: ./output_64x112_8f_16bs +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 16 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 64 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 112 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/output.log b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..cafb69ae962b54d49a6b619869fc63bb07ab52f6 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/output.log @@ -0,0 +1,39 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 69.36it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5493.01it/s] +12/08/2025 03:21:22 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=4, W'=7 +12/08/2025 03:21:22 - INFO - __main__ - Theoretical dimensions: F'=1, H'=4, W'=7 +12/08/2025 03:21:22 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 03:21:23 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 03:21:38 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 03:21:38 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 03:22:05 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/08/2025 03:22:08 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/08/2025 03:22:15 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/08/2025 03:22:15 - INFO - train.dataset_utils - Using decord for video loading +12/08/2025 03:22:15 - INFO - __main__ - Dataloader configuration: +12/08/2025 03:22:15 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/08/2025 03:22:15 - INFO - __main__ - - prefetch_factor: 2 +12/08/2025 03:22:15 - INFO - __main__ - - persistent_workers: True +12/08/2025 03:22:15 - INFO - __main__ - - pin_memory: True +12/08/2025 03:22:15 - INFO - __main__ - Preparing model, optimizer and dataloaders +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1363, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1033, in main + text_encoder, tokenize_prompt(tokenizer, "", args.text_encoder_architecture).to(accelerator.device, non_blocking=True), args.text_encoder_architecture + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/train/dataset_utils.py", line 69, in tokenize_prompt + raise ValueError(f"Unknown text_encoder_architecture: {text_encoder_architecture}") +ValueError: Unknown text_encoder_architecture: umt5-xxl +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1363, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1033, in main +[rank0]: text_encoder, tokenize_prompt(tokenizer, "", args.text_encoder_architecture).to(accelerator.device, non_blocking=True), args.text_encoder_architecture +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 69, in tokenize_prompt +[rank0]: raise ValueError(f"Unknown text_encoder_architecture: {text_encoder_architecture}") +[rank0]: ValueError: Unknown text_encoder_architecture: umt5-xxl diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/requirements.txt b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/requirements.txt new file 
mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..13db8eae4ea4367e9ba279b2e0f2107f82212614 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T03:21:17.884575Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + 
"Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "9", + "--video_height", + "64", + "--video_width", + "112", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "16", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_64x112_8f_16bs", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11813091844096" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "xtt0igej7usiniueah5c5vplpej2u7p3" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/wandb-summary.json b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/wandb-summary.json new file mode 100644 index 
0000000000000000000000000000000000000000..f216acddd6fec52473bb964018993e25ebe701aa --- /dev/null +++ b/Meissonic/wandb/run-20251208_032117-84ktbwyp/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":232},"_runtime":232} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug-core.log b/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..1949a2955f6458833dad3423485a006497865521 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-08T03:21:17.951376965Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp11o8j3tm/port-3355453.txt","pid":3355453,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T03:21:17.951948637Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3355453} +{"time":"2025-12-08T03:21:17.951960045Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3355453-3355707-2981470890/socket","Net":"unix"}} +{"time":"2025-12-08T03:21:18.137503071Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T03:21:18.143619116Z","level":"INFO","msg":"handleInformInit: received","streamId":"84ktbwyp","id":"1(@)"} +{"time":"2025-12-08T03:21:18.319484041Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"84ktbwyp","id":"1(@)"} +{"time":"2025-12-08T03:25:10.729670716Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-08T03:25:10.729715992Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-08T03:25:10.729712192Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-08T03:25:10.729766051Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-08T03:25:10.729807052Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3355453-3355707-2981470890/socket","Net":"unix"}} +{"time":"2025-12-08T03:25:11.116771304Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-08T03:25:11.116796954Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-08T03:25:11.116808113Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug-internal.log b/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..4ca9987714929550356a175e5b0fc2da35af7ea8 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-08T03:21:18.143764029Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T03:21:18.31920589Z","level":"INFO","msg":"stream: created new stream","id":"84ktbwyp"} +{"time":"2025-12-08T03:21:18.319349929Z","level":"INFO","msg":"handler: started","stream_id":"84ktbwyp"} +{"time":"2025-12-08T03:21:18.319476317Z","level":"INFO","msg":"stream: started","id":"84ktbwyp"} +{"time":"2025-12-08T03:21:18.319498724Z","level":"INFO","msg":"writer: started","stream_id":"84ktbwyp"} +{"time":"2025-12-08T03:21:18.319499946Z","level":"INFO","msg":"sender: started","stream_id":"84ktbwyp"} 
+{"time":"2025-12-08T03:25:10.729723902Z","level":"INFO","msg":"stream: closing","id":"84ktbwyp"} +{"time":"2025-12-08T03:25:10.979653379Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-08T03:25:11.113672154Z","level":"INFO","msg":"handler: closed","stream_id":"84ktbwyp"} +{"time":"2025-12-08T03:25:11.113797561Z","level":"INFO","msg":"sender: closed","stream_id":"84ktbwyp"} +{"time":"2025-12-08T03:25:11.113814607Z","level":"INFO","msg":"stream: closed","id":"84ktbwyp"} diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug.log b/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..79953f52c15dc1ccccc0fa0c586299b6452c0bd6 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_setup.py:_flush():80] Configure stats pid to 3355453 +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug.log +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_032117-84ktbwyp/logs/debug-internal.log +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_init.py:init():841] calling init triggers +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 03:21:17,887 INFO MainThread:3355453 [wandb_init.py:init():889] starting backend +2025-12-08 03:21:18,137 INFO MainThread:3355453 [wandb_init.py:init():892] sending inform_init request +2025-12-08 03:21:18,142 INFO MainThread:3355453 [wandb_init.py:init():900] backend started and connected +2025-12-08 03:21:18,143 INFO MainThread:3355453 [wandb_init.py:init():970] updated telemetry +2025-12-08 03:21:18,147 INFO MainThread:3355453 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 03:21:18,482 INFO MainThread:3355453 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 03:21:18,596 INFO MainThread:3355453 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 03:21:18,596 INFO MainThread:3355453 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 03:21:18,596 INFO MainThread:3355453 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 03:21:18,596 INFO MainThread:3355453 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-08 03:21:18,599 INFO MainThread:3355453 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 03:21:18,600 INFO MainThread:3355453 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_64x112_8f_16bs', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 16, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 9, 'video_height': 64, 'video_width': 112, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-08 03:25:10,729 INFO wandb-AsyncioManager-main:3355453 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-08 03:25:10,730 INFO wandb-AsyncioManager-main:3355453 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251208_032117-84ktbwyp/run-84ktbwyp.wandb b/Meissonic/wandb/run-20251208_032117-84ktbwyp/run-84ktbwyp.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a5a2ebb1a6e29d22fd7c4da5fc61b6344dbf184a Binary files /dev/null and b/Meissonic/wandb/run-20251208_032117-84ktbwyp/run-84ktbwyp.wandb differ diff --git a/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/output.log b/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..6923566ba663a19ad342f1f28a6d676994754e10 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/output.log @@ -0,0 +1,56 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 67.76it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/08/2025 03:29:58 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7052.64it/s] +12/08/2025 03:30:00 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=4, W'=7 +12/08/2025 03:30:00 - INFO - __main__ - Theoretical dimensions: F'=1, H'=4, W'=7 +12/08/2025 03:30:00 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 03:30:00 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 03:30:17 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 03:30:17 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 03:30:19 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/08/2025 03:30:22 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/08/2025 03:30:29 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/08/2025 03:30:29 - INFO - train.dataset_utils - Using decord for video loading +12/08/2025 03:30:29 - INFO - __main__ - Dataloader configuration: +12/08/2025 03:30:29 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/08/2025 03:30:29 - INFO - __main__ - - prefetch_factor: 2 +12/08/2025 03:30:29 - INFO - __main__ - - persistent_workers: True +12/08/2025 03:30:29 - INFO - __main__ - - pin_memory: True +12/08/2025 03:30:29 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/08/2025 03:30:47 - INFO - __main__ - ***** Running training ***** +12/08/2025 03:30:47 - INFO - __main__ - Num training steps = 10000 +12/08/2025 03:30:47 - INFO - __main__ - Instantaneous batch size per device = 16 +12/08/2025 03:30:47 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 128 +12/08/2025 03:30:47 - INFO - __main__ - Gradient Accumulation steps = 1 +12/08/2025 03:33:30 - INFO - __main__ - Step: 10 Loss: 11.1033 LR: 0.000300 +12/08/2025 03:34:08 - INFO - __main__ - Step: 20 Loss: 11.1034 LR: 0.000300 +12/08/2025 03:35:23 - INFO - __main__ - Step: 30 Loss: 11.1034 LR: 0.000300 +12/08/2025 03:36:15 - INFO - __main__ - Step: 40 Loss: 11.0977 LR: 0.000300 +12/08/2025 03:37:58 - INFO - __main__ - Step: 50 Loss: 11.0971 LR: 0.000300 +12/08/2025 03:38:36 - INFO - __main__ - Step: 60 Loss: 11.0946 LR: 0.000300 +12/08/2025 03:39:35 - INFO - __main__ - Step: 70 Loss: 11.0899 LR: 0.000300 +12/08/2025 03:40:55 - INFO - __main__ - Step: 80 Loss: 11.0908 LR: 0.000300 +12/08/2025 03:42:24 - INFO - __main__ - Step: 90 Loss: 11.0784 LR: 0.000300 +12/08/2025 03:43:06 - INFO - __main__ - Step: 100 Loss: 11.0728 LR: 0.000300 +12/08/2025 03:44:02 - INFO - __main__ - Step: 110 Loss: 11.0524 LR: 0.000300 +12/08/2025 03:45:19 - INFO - __main__ - Step: 120 Loss: 11.0345 LR: 0.000300 +12/08/2025 03:46:17 - INFO - __main__ - Step: 130 Loss: 11.0061 LR: 0.000300 +12/08/2025 03:47:33 - INFO - __main__ - Step: 140 Loss: 10.9730 LR: 0.000300 +12/08/2025 03:48:25 - INFO - __main__ - Step: 150 Loss: 10.9452 LR: 0.000300 +12/08/2025 03:49:34 - INFO - __main__ - Step: 160 Loss: 10.9156 LR: 0.000300 +12/08/2025 03:50:30 - INFO - __main__ - Step: 170 Loss: 10.8766 LR: 0.000300 +12/08/2025 03:51:28 - INFO - __main__ - Step: 180 Loss: 10.8338 LR: 0.000300 +12/08/2025 03:52:38 - INFO - __main__ - Step: 190 Loss: 10.8179 LR: 0.000300 +12/08/2025 03:53:55 - INFO - __main__ - Step: 200 Loss: 10.7630 LR: 0.000300 +12/08/2025 03:54:34 - INFO - __main__ - Step: 210 Loss: 10.7580 LR: 0.000300 +12/08/2025 03:56:04 - INFO - __main__ - Step: 220 Loss: 10.7254 LR: 0.000300 +12/08/2025 03:56:56 - INFO - __main__ - Step: 230 Loss: 10.6992 LR: 0.000300 +12/08/2025 03:58:10 - INFO - __main__ - Step: 240 Loss: 10.6697 LR: 0.000300 +12/08/2025 03:59:41 - INFO - __main__ - Step: 250 Loss: 10.6544 LR: 0.000300 +12/08/2025 04:00:36 - INFO - __main__ - Step: 260 Loss: 10.6203 LR: 0.000300 +12/08/2025 04:01:27 - INFO - __main__ - Step: 270 Loss: 10.6068 LR: 0.000300 +12/08/2025 04:02:38 - INFO - __main__ - Step: 280 Loss: 10.5423 LR: 0.000300 +12/08/2025 04:03:58 - INFO - __main__ - Step: 290 Loss: 10.5799 LR: 0.000300 +12/08/2025 04:04:48 - INFO - __main__ - Step: 300 Loss: 10.5523 LR: 0.000300 +12/08/2025 04:05:32 - INFO - __main__ - Step: 310 Loss: 10.5698 LR: 0.000300 diff --git a/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/requirements.txt b/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 
+peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5a0f6f92c8e4228d6514ec50900dac905b812ee6 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032955-tl61pd0t/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T03:29:55.147354Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "9", + "--video_height", + "64", + "--video_width", + "112", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "16", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_64x112_8f_16bs", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + 
"--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11813092118528" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "9ewipho8y9ufx2kaipiagam4kqn0wsgz" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug-core.log b/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..501d0ae2da501a7a12c508ef69d9ae22202576e2 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-08T03:29:55.218073996Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpi3vrch57/port-3361354.txt","pid":3361354,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T03:29:55.218523005Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3361354} +{"time":"2025-12-08T03:29:55.218497187Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3361354-3361609-1392137282/socket","Net":"unix"}} +{"time":"2025-12-08T03:29:55.40139843Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} 
+{"time":"2025-12-08T03:29:55.407669546Z","level":"INFO","msg":"handleInformInit: received","streamId":"tl61pd0t","id":"1(@)"} +{"time":"2025-12-08T03:29:55.578988829Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"tl61pd0t","id":"1(@)"} +{"time":"2025-12-08T04:05:43.832486673Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug-internal.log b/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..df455bf5d3b8c5d6633bb8d6ca582dbddb7dd988 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-08T03:29:55.407753699Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T03:29:55.578794326Z","level":"INFO","msg":"stream: created new stream","id":"tl61pd0t"} +{"time":"2025-12-08T03:29:55.578885522Z","level":"INFO","msg":"handler: started","stream_id":"tl61pd0t"} +{"time":"2025-12-08T03:29:55.578981351Z","level":"INFO","msg":"stream: started","id":"tl61pd0t"} +{"time":"2025-12-08T03:29:55.578999079Z","level":"INFO","msg":"writer: started","stream_id":"tl61pd0t"} +{"time":"2025-12-08T03:29:55.578999792Z","level":"INFO","msg":"sender: started","stream_id":"tl61pd0t"} diff --git a/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug.log b/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..747d6fe5211bbb35f431fea0b7ae1cf5cce11de0 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-08 03:29:55,149 INFO MainThread:3361354 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 03:29:55,149 INFO MainThread:3361354 [wandb_setup.py:_flush():80] Configure stats pid to 3361354 +2025-12-08 03:29:55,149 INFO MainThread:3361354 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 03:29:55,150 INFO MainThread:3361354 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 03:29:55,150 INFO MainThread:3361354 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 03:29:55,150 INFO MainThread:3361354 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug.log +2025-12-08 03:29:55,150 INFO MainThread:3361354 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_032955-tl61pd0t/logs/debug-internal.log +2025-12-08 03:29:55,150 INFO MainThread:3361354 [wandb_init.py:init():841] calling init triggers +2025-12-08 03:29:55,150 INFO MainThread:3361354 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 03:29:55,150 INFO MainThread:3361354 [wandb_init.py:init():889] starting backend +2025-12-08 03:29:55,401 INFO MainThread:3361354 [wandb_init.py:init():892] sending inform_init request +2025-12-08 03:29:55,406 INFO MainThread:3361354 [wandb_init.py:init():900] backend started and connected +2025-12-08 03:29:55,408 INFO MainThread:3361354 [wandb_init.py:init():970] updated telemetry +2025-12-08 03:29:55,412 INFO MainThread:3361354 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 03:29:55,879 INFO MainThread:3361354 [wandb_init.py:init():1041] 
starting run threads in backend +2025-12-08 03:29:55,988 INFO MainThread:3361354 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 03:29:55,988 INFO MainThread:3361354 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 03:29:55,988 INFO MainThread:3361354 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 03:29:55,988 INFO MainThread:3361354 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-08 03:29:55,991 INFO MainThread:3361354 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 03:29:55,991 INFO MainThread:3361354 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_64x112_8f_16bs', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 16, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 9, 'video_height': 64, 'video_width': 112, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251208_032955-tl61pd0t/run-tl61pd0t.wandb b/Meissonic/wandb/run-20251208_032955-tl61pd0t/run-tl61pd0t.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e0d555864914635b85a37110a9c99bfadac4cee8 --- /dev/null +++ b/Meissonic/wandb/run-20251208_032955-tl61pd0t/run-tl61pd0t.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e6df6c74d58c6d4c24a7aa545f9c7c8f5fe65622e55dfd93543de7a3613fbe2 +size 524288 diff --git a/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/output.log b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8928b902d0409bf937037cf6a41de48ce630f38e --- /dev/null +++ b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/output.log @@ -0,0 +1,239 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 65.76it/s] +You are using the default legacy behaviour of the . 
This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/08/2025 04:06:09 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5144.58it/s] +12/08/2025 04:06:11 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=16, W'=28 +12/08/2025 04:06:11 - INFO - __main__ - Theoretical dimensions: F'=1, H'=16, W'=28 +12/08/2025 04:06:11 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 04:06:11 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 04:06:28 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 04:06:28 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 04:06:30 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/08/2025 04:06:32 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/08/2025 04:06:39 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/08/2025 04:06:39 - INFO - train.dataset_utils - Using decord for video loading +12/08/2025 04:06:39 - INFO - __main__ - Dataloader configuration: +12/08/2025 04:06:39 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/08/2025 04:06:39 - INFO - __main__ - - prefetch_factor: 2 +12/08/2025 04:06:39 - INFO - __main__ - - persistent_workers: True +12/08/2025 04:06:39 - INFO - __main__ - - pin_memory: True +12/08/2025 04:06:39 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/08/2025 04:06:55 - INFO - __main__ - ***** Running training ***** +12/08/2025 04:06:55 - INFO - __main__ - Num training steps = 10000 +12/08/2025 04:06:55 - INFO - __main__ - Instantaneous batch size per device = 4 +12/08/2025 04:06:55 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 32 +12/08/2025 04:06:55 - INFO - __main__ - Gradient Accumulation steps = 1 +12/08/2025 04:07:29 - INFO - __main__ - Step: 10 Loss: 11.0996 LR: 0.000300 +12/08/2025 04:07:43 - INFO - __main__ - Step: 20 Loss: 11.0992 LR: 0.000300 +12/08/2025 04:07:56 - INFO - __main__ - Step: 30 Loss: 11.0959 LR: 0.000300 +12/08/2025 04:08:11 - INFO - __main__ - Step: 40 Loss: 11.0928 LR: 0.000300 +12/08/2025 04:08:25 - INFO - __main__ - Step: 50 Loss: 11.0939 LR: 0.000300 +12/08/2025 04:08:38 - INFO - __main__ - Step: 60 Loss: 11.0928 LR: 0.000300 +12/08/2025 04:08:53 - INFO - __main__ - Step: 70 Loss: 11.0855 LR: 0.000300 +12/08/2025 04:09:08 - INFO - __main__ - Step: 80 Loss: 11.0847 LR: 0.000300 +12/08/2025 04:09:23 - INFO - __main__ - Step: 90 Loss: 11.0807 LR: 0.000300 +12/08/2025 04:09:37 - INFO - __main__ - Step: 100 Loss: 11.0708 LR: 0.000300 +12/08/2025 04:09:51 - INFO - __main__ - Step: 110 Loss: 11.0565 LR: 0.000300 +12/08/2025 04:10:06 - INFO - __main__ - Step: 120 Loss: 11.0333 LR: 0.000300 +12/08/2025 04:10:20 - INFO - __main__ - Step: 130 Loss: 11.0024 LR: 0.000300 +12/08/2025 04:10:35 - INFO - __main__ - Step: 140 Loss: 10.9827 LR: 0.000300 +12/08/2025 04:10:49 - INFO - __main__ - Step: 150 Loss: 10.9469 LR: 0.000300 +12/08/2025 04:11:04 - INFO - __main__ - Step: 160 Loss: 10.9046 LR: 0.000300 +12/08/2025 04:11:18 - INFO - __main__ - Step: 170 Loss: 10.8506 LR: 0.000300 +12/08/2025 04:11:33 - INFO - __main__ - Step: 180 Loss: 10.8237 LR: 0.000300 +12/08/2025 04:11:47 - INFO - __main__ - Step: 190 Loss: 10.8008 LR: 0.000300 +12/08/2025 04:12:02 - INFO - __main__ - Step: 200 Loss: 10.7369 LR: 0.000300 +12/08/2025 04:12:17 - INFO - __main__ - Step: 210 Loss: 10.7677 LR: 0.000300 +12/08/2025 04:12:32 - INFO - __main__ - Step: 220 Loss: 10.7119 LR: 0.000300 +12/08/2025 04:12:47 - INFO - __main__ - Step: 230 Loss: 10.6432 LR: 0.000300 +12/08/2025 04:13:01 - INFO - __main__ - Step: 240 Loss: 10.6370 LR: 0.000300 +12/08/2025 04:13:15 - INFO - __main__ - Step: 250 Loss: 10.6359 LR: 0.000300 +12/08/2025 04:13:29 - INFO - __main__ - Step: 260 Loss: 10.5410 LR: 0.000300 +12/08/2025 04:13:43 - INFO - __main__ - Step: 270 Loss: 10.5677 LR: 0.000300 +12/08/2025 04:13:58 - INFO - __main__ - Step: 280 Loss: 10.5715 LR: 0.000300 +12/08/2025 04:14:13 - INFO - __main__ - Step: 290 Loss: 10.4722 LR: 0.000300 +12/08/2025 04:14:29 - INFO - __main__ - Step: 300 Loss: 10.5197 LR: 0.000300 +12/08/2025 04:14:43 - INFO - __main__ - Step: 310 Loss: 10.4737 LR: 0.000300 +12/08/2025 04:14:58 - INFO - __main__ - Step: 320 Loss: 10.4552 LR: 0.000300 +12/08/2025 04:15:13 - INFO - __main__ - Step: 330 Loss: 10.5440 LR: 0.000300 +12/08/2025 04:15:28 - INFO - __main__ - Step: 340 Loss: 10.4659 LR: 0.000300 +12/08/2025 04:15:44 - INFO - __main__ - Step: 350 Loss: 10.4662 LR: 0.000300 +12/08/2025 04:15:59 - INFO - __main__ - Step: 360 Loss: 10.4312 LR: 0.000300 +12/08/2025 04:16:14 - INFO - __main__ - Step: 370 Loss: 10.4508 LR: 0.000300 +12/08/2025 04:16:28 - INFO - __main__ - Step: 380 Loss: 10.4507 LR: 0.000300 +12/08/2025 04:16:43 - INFO - __main__ - Step: 390 Loss: 10.4685 LR: 0.000300 +12/08/2025 04:16:59 - INFO - __main__ - Step: 400 Loss: 10.3776 LR: 0.000300 +12/08/2025 04:17:14 - INFO - __main__ - Step: 410 Loss: 10.4464 LR: 0.000300 +12/08/2025 04:17:28 - INFO - __main__ - Step: 420 Loss: 10.3895 LR: 0.000300 +12/08/2025 04:17:44 - INFO - __main__ - Step: 430 Loss: 10.3926 LR: 0.000300 +12/08/2025 04:17:58 - INFO - __main__ - Step: 440 Loss: 10.4312 LR: 0.000300 +12/08/2025 
04:18:14 - INFO - __main__ - Step: 450 Loss: 10.3412 LR: 0.000300 +12/08/2025 04:18:29 - INFO - __main__ - Step: 460 Loss: 10.2831 LR: 0.000300 +12/08/2025 04:18:44 - INFO - __main__ - Step: 470 Loss: 10.2265 LR: 0.000300 +12/08/2025 04:18:59 - INFO - __main__ - Step: 480 Loss: 10.3602 LR: 0.000300 +12/08/2025 04:19:15 - INFO - __main__ - Step: 490 Loss: 10.3724 LR: 0.000300 +12/08/2025 04:19:30 - INFO - __main__ - Step: 500 Loss: 10.3895 LR: 0.000300 +12/08/2025 04:19:30 - INFO - accelerate.accelerator - Saving current state to output_256x448_9f_4bs/checkpoint-500 +12/08/2025 04:19:41 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_9f_4bs/checkpoint-500/optimizer.bin +12/08/2025 04:19:41 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_9f_4bs/checkpoint-500/scheduler.bin +12/08/2025 04:19:41 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_9f_4bs/checkpoint-500/sampler.bin +12/08/2025 04:19:41 - INFO - accelerate.checkpointing - Random states saved in output_256x448_9f_4bs/checkpoint-500/random_states_0.pkl +12/08/2025 04:19:41 - INFO - __main__ - Saved state to output_256x448_9f_4bs/checkpoint-500 +12/08/2025 04:19:41 - INFO - __main__ - Generating videos for validation... +12/08/2025 04:19:41 - INFO - __main__ - Generating videos for validation... +12/08/2025 04:19:41 - ERROR - __main__ - Video validation failed: num_frames (9) must be divisible by temporal downsampling factor (8) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1278, in main + videos = pipe( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/src/pipeline_video.py", line 541, in __call__ + raise ValueError( +ValueError: num_frames (9) must be divisible by temporal downsampling factor (8) +12/08/2025 04:19:57 - INFO - __main__ - Step: 510 Loss: 10.3469 LR: 0.000300 +12/08/2025 04:20:12 - INFO - __main__ - Step: 520 Loss: 10.3415 LR: 0.000300 +12/08/2025 04:20:27 - INFO - __main__ - Step: 530 Loss: 10.4008 LR: 0.000300 +12/08/2025 04:20:42 - INFO - __main__ - Step: 540 Loss: 10.2568 LR: 0.000300 +12/08/2025 04:20:58 - INFO - __main__ - Step: 550 Loss: 10.3286 LR: 0.000300 +12/08/2025 04:21:13 - INFO - __main__ - Step: 560 Loss: 10.3954 LR: 0.000300 +12/08/2025 04:21:28 - INFO - __main__ - Step: 570 Loss: 10.3188 LR: 0.000300 +12/08/2025 04:21:42 - INFO - __main__ - Step: 580 Loss: 10.3820 LR: 0.000300 +12/08/2025 04:21:58 - INFO - __main__ - Step: 590 Loss: 10.3850 LR: 0.000300 +12/08/2025 04:22:12 - INFO - __main__ - Step: 600 Loss: 10.3458 LR: 0.000300 +12/08/2025 04:22:26 - INFO - __main__ - Step: 610 Loss: 10.3710 LR: 0.000300 +12/08/2025 04:22:41 - INFO - __main__ - Step: 620 Loss: 10.3231 LR: 0.000300 +12/08/2025 04:22:56 - INFO - __main__ - Step: 630 Loss: 10.3351 LR: 0.000300 +12/08/2025 04:23:10 - INFO - __main__ - Step: 640 Loss: 10.3568 LR: 0.000300 +12/08/2025 04:23:25 - INFO - __main__ - Step: 650 Loss: 10.2831 LR: 0.000300 +12/08/2025 04:23:39 - INFO - __main__ - Step: 660 Loss: 10.2838 LR: 0.000300 +12/08/2025 04:23:54 - INFO - __main__ - Step: 670 Loss: 10.2400 LR: 0.000300 +12/08/2025 04:24:08 - INFO - __main__ - Step: 680 Loss: 10.3153 LR: 0.000300 +12/08/2025 04:24:23 - INFO - __main__ - Step: 690 Loss: 10.3624 LR: 0.000300 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1359, in + 
main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1171, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward + out_list = torch.utils.checkpoint.checkpoint( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + 
File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 359, in forward + k=rope_apply(k, grid_sizes, freqs), + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 251, in rope_apply + x_i = torch.view_as_real(x_i * freqs_i).flatten(2) +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1359, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1171, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward +[rank0]: return model_forward(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: 
return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward +[rank0]: out_list = torch.utils.checkpoint.checkpoint( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint +[rank0]: ret = function(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward +[rank0]: y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 359, in forward +[rank0]: k=rope_apply(k, grid_sizes, freqs), +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 251, in rope_apply +[rank0]: x_i = torch.view_as_real(x_i * freqs_i).flatten(2) +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/requirements.txt b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..734c2a1d156069567b4cb1d81281a51800cb52dc --- /dev/null +++ b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T04:06:06.027369Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + 
"--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "9", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-1.0-Tokenizer-DV8x16x16", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_9f_4bs", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11813092904960" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "ye6zoy1wxwaa4ivuuvr0hgyyqpzf7o03" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug-core.log b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug-core.log new file mode 100644 index 
0000000000000000000000000000000000000000..9b7270216a8559815b2a9ea42a7e7dc038786a44 --- /dev/null +++ b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-08T04:06:06.095985831Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpw7aanfnn/port-290084.txt","pid":290084,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T04:06:06.096562812Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":290084} +{"time":"2025-12-08T04:06:06.096546963Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-290084-290343-112641643/socket","Net":"unix"}} +{"time":"2025-12-08T04:06:06.28262354Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T04:06:06.288971188Z","level":"INFO","msg":"handleInformInit: received","streamId":"2dcjc9k8","id":"1(@)"} +{"time":"2025-12-08T04:06:06.457234565Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"2dcjc9k8","id":"1(@)"} +{"time":"2025-12-08T04:24:37.707615163Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug-internal.log b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e4e51aed8f34e9667f6424ba1e0ebd28921cfbc9 --- /dev/null +++ b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-08T04:06:06.289151581Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T04:06:06.457022976Z","level":"INFO","msg":"stream: created new stream","id":"2dcjc9k8"} +{"time":"2025-12-08T04:06:06.457113741Z","level":"INFO","msg":"handler: started","stream_id":"2dcjc9k8"} +{"time":"2025-12-08T04:06:06.457227301Z","level":"INFO","msg":"stream: started","id":"2dcjc9k8"} +{"time":"2025-12-08T04:06:06.457255259Z","level":"INFO","msg":"sender: started","stream_id":"2dcjc9k8"} +{"time":"2025-12-08T04:06:06.457255856Z","level":"INFO","msg":"writer: started","stream_id":"2dcjc9k8"} diff --git a/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug.log b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..adc7737841c2a8b057f943ffb2cfa61188b97667 --- /dev/null +++ b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_setup.py:_flush():80] Configure stats pid to 290084 +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug.log +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to 
/mnt/Meissonic/wandb/run-20251208_040606-2dcjc9k8/logs/debug-internal.log +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_init.py:init():841] calling init triggers +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 04:06:06,030 INFO MainThread:290084 [wandb_init.py:init():889] starting backend +2025-12-08 04:06:06,282 INFO MainThread:290084 [wandb_init.py:init():892] sending inform_init request +2025-12-08 04:06:06,287 INFO MainThread:290084 [wandb_init.py:init():900] backend started and connected +2025-12-08 04:06:06,289 INFO MainThread:290084 [wandb_init.py:init():970] updated telemetry +2025-12-08 04:06:06,294 INFO MainThread:290084 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 04:06:06,688 INFO MainThread:290084 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 04:06:06,801 INFO MainThread:290084 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 04:06:06,801 INFO MainThread:290084 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 04:06:06,801 INFO MainThread:290084 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 04:06:06,801 INFO MainThread:290084 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-08 04:06:06,804 INFO MainThread:290084 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 04:06:06,805 INFO MainThread:290084 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_9f_4bs', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 9, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251208_040606-2dcjc9k8/run-2dcjc9k8.wandb b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/run-2dcjc9k8.wandb new file mode 100644 index 
0000000000000000000000000000000000000000..28c08b9445ef1c25f618a4d59c79deda77792616 --- /dev/null +++ b/Meissonic/wandb/run-20251208_040606-2dcjc9k8/run-2dcjc9k8.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7495b8eff0e7f0c259b20ef00d19656f0b2b1fc4e51d4cd5b0b128b6a8c336a1 +size 294912 diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/config.yaml b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72dcfe99ffb21617e9a63ee8bd61d04914b4f296 --- /dev/null +++ b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/config.yaml @@ -0,0 +1,288 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + i7ve4wwwnqexgpibqdpn8euqx2nyjccr: + args: + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --num_frames + - "8" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "4" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_9f_4bs + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11819720794112" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-08T06:24:56.224488Z" + writerId: i7ve4wwwnqexgpibqdpn8euqx2nyjccr + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 8 +output_dir: + value: ./output_256x448_9f_4bs +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 4 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/output.log b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..132972af43972ad88252665daaffd30c24756331 --- /dev/null +++ b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/output.log @@ -0,0 +1,53 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 69.55it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/08/2025 06:24:59 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4788.02it/s] +12/08/2025 06:25:01 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=3, H'=32, W'=56 +12/08/2025 06:25:01 - INFO - __main__ - Theoretical dimensions: F'=1, H'=16, W'=28 +12/08/2025 06:25:01 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 06:25:01 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 06:25:16 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 06:25:16 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 06:25:18 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/08/2025 06:25:20 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/08/2025 06:25:27 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/08/2025 06:25:27 - INFO - train.dataset_utils - Using decord for video loading +12/08/2025 06:25:27 - INFO - __main__ - Dataloader configuration: +12/08/2025 06:25:27 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/08/2025 06:25:27 - INFO - __main__ - - prefetch_factor: 2 +12/08/2025 06:25:27 - INFO - __main__ - - persistent_workers: True +12/08/2025 06:25:27 - INFO - __main__ - - pin_memory: True +12/08/2025 06:25:27 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/08/2025 06:25:45 - INFO - __main__ - ***** Running training ***** +12/08/2025 06:25:45 - INFO - __main__ - Num training steps = 10000 +12/08/2025 06:25:45 - INFO - __main__ - Instantaneous batch size per device = 4 +12/08/2025 06:25:45 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 32 +12/08/2025 06:25:45 - INFO - __main__ - Gradient Accumulation steps = 1 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1359, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1207, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 5.25 GiB. GPU 0 has a total capacity of 39.49 GiB of which 3.25 GiB is free. Process 988220 has 414.00 MiB memory in use. Process 988225 has 414.00 MiB memory in use. Process 988223 has 414.00 MiB memory in use. 
Process 988224 has 414.00 MiB memory in use. Process 988219 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 33.35 GiB memory in use. Process 988222 has 414.00 MiB memory in use. Process 988221 has 414.00 MiB memory in use. Of the allocated memory 31.91 GiB is allocated by PyTorch, and 349.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1359, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1207, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 5.25 GiB. GPU 0 has a total capacity of 39.49 GiB of which 3.25 GiB is free. Process 988220 has 414.00 MiB memory in use. Process 988225 has 414.00 MiB memory in use. Process 988223 has 414.00 MiB memory in use. Process 988224 has 414.00 MiB memory in use. Process 988219 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 33.35 GiB memory in use. Process 988222 has 414.00 MiB memory in use. Process 988221 has 414.00 MiB memory in use. Of the allocated memory 31.91 GiB is allocated by PyTorch, and 349.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/requirements.txt b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..25bb324183c438189f4c3ed7f5e4731d6cc736b4 --- /dev/null +++ 
b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T06:24:56.224488Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "8", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "4", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_9f_4bs", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11819720794112" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": 
"12.8", + "writerId": "i7ve4wwwnqexgpibqdpn8euqx2nyjccr" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/wandb-summary.json b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f6a58e23e676a0a08733884f4e00d6ed642aea39 --- /dev/null +++ b/Meissonic/wandb/run-20251208_062456-qz69oyz3/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":112},"_runtime":112} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug-core.log b/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..9bac9286da012787fc6603ca767e04e16f7fc582 --- /dev/null +++ b/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-08T06:24:56.295049346Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp481wqo3r/port-988218.txt","pid":988218,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T06:24:56.295513671Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":988218} +{"time":"2025-12-08T06:24:56.29552614Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-988218-988475-3388223444/socket","Net":"unix"}} +{"time":"2025-12-08T06:24:56.480225837Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T06:24:56.48670425Z","level":"INFO","msg":"handleInformInit: received","streamId":"qz69oyz3","id":"1(@)"} +{"time":"2025-12-08T06:24:56.652693188Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"qz69oyz3","id":"1(@)"} +{"time":"2025-12-08T06:26:48.94969808Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-08T06:26:48.949777226Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-08T06:26:48.949758796Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-08T06:26:48.949872901Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-08T06:26:48.949881001Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-988218-988475-3388223444/socket","Net":"unix"}} +{"time":"2025-12-08T06:26:49.346344105Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-08T06:26:49.346375364Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-08T06:26:49.346388259Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug-internal.log b/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3346f6c5ea0b042df17cc359bec3a89f2c99258e --- /dev/null +++ b/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-08T06:24:56.486803829Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T06:24:56.652468229Z","level":"INFO","msg":"stream: created new stream","id":"qz69oyz3"} +{"time":"2025-12-08T06:24:56.652567387Z","level":"INFO","msg":"handler: started","stream_id":"qz69oyz3"} 
+{"time":"2025-12-08T06:24:56.652684577Z","level":"INFO","msg":"stream: started","id":"qz69oyz3"} +{"time":"2025-12-08T06:24:56.65272407Z","level":"INFO","msg":"writer: started","stream_id":"qz69oyz3"} +{"time":"2025-12-08T06:24:56.652727637Z","level":"INFO","msg":"sender: started","stream_id":"qz69oyz3"} +{"time":"2025-12-08T06:26:48.949782152Z","level":"INFO","msg":"stream: closing","id":"qz69oyz3"} +{"time":"2025-12-08T06:26:49.210637319Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-08T06:26:49.342287919Z","level":"INFO","msg":"handler: closed","stream_id":"qz69oyz3"} +{"time":"2025-12-08T06:26:49.342363249Z","level":"INFO","msg":"sender: closed","stream_id":"qz69oyz3"} +{"time":"2025-12-08T06:26:49.342374552Z","level":"INFO","msg":"stream: closed","id":"qz69oyz3"} diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug.log b/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0964d593925c1ffa8b437f17424ea8615fe235f4 --- /dev/null +++ b/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_setup.py:_flush():80] Configure stats pid to 988218 +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug.log +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_062456-qz69oyz3/logs/debug-internal.log +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_init.py:init():841] calling init triggers +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 06:24:56,227 INFO MainThread:988218 [wandb_init.py:init():889] starting backend +2025-12-08 06:24:56,480 INFO MainThread:988218 [wandb_init.py:init():892] sending inform_init request +2025-12-08 06:24:56,485 INFO MainThread:988218 [wandb_init.py:init():900] backend started and connected +2025-12-08 06:24:56,486 INFO MainThread:988218 [wandb_init.py:init():970] updated telemetry +2025-12-08 06:24:56,491 INFO MainThread:988218 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 06:24:56,892 INFO MainThread:988218 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 06:24:57,001 INFO MainThread:988218 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 06:24:57,001 INFO MainThread:988218 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 06:24:57,002 INFO MainThread:988218 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 06:24:57,002 INFO MainThread:988218 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-08 06:24:57,005 INFO MainThread:988218 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 06:24:57,006 INFO MainThread:988218 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_9f_4bs', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 8, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} +2025-12-08 06:26:48,949 INFO wandb-AsyncioManager-main:988218 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-08 06:26:48,949 INFO wandb-AsyncioManager-main:988218 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251208_062456-qz69oyz3/run-qz69oyz3.wandb b/Meissonic/wandb/run-20251208_062456-qz69oyz3/run-qz69oyz3.wandb new file mode 100644 index 0000000000000000000000000000000000000000..4e7dddf4b7353ea315af6e9868878bc05f583cdf Binary files /dev/null and b/Meissonic/wandb/run-20251208_062456-qz69oyz3/run-qz69oyz3.wandb differ diff --git a/Meissonic/wandb/run-20251208_062741-qalkbn80/files/output.log b/Meissonic/wandb/run-20251208_062741-qalkbn80/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f2670a600f41fa9a0b88eedef686a54cf3e84879 --- /dev/null +++ b/Meissonic/wandb/run-20251208_062741-qalkbn80/files/output.log @@ -0,0 +1,167 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 67.43it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/08/2025 06:27:44 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5715.42it/s] +12/08/2025 06:27:46 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=4, W'=7 +12/08/2025 06:27:46 - INFO - __main__ - Theoretical dimensions: F'=0, H'=2, W'=3 +12/08/2025 06:27:46 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 06:27:46 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 06:28:03 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 06:28:03 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 06:28:05 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/08/2025 06:28:07 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/08/2025 06:28:14 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/08/2025 06:28:14 - INFO - train.dataset_utils - Using decord for video loading +12/08/2025 06:28:14 - INFO - __main__ - Dataloader configuration: +12/08/2025 06:28:14 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/08/2025 06:28:14 - INFO - __main__ - - prefetch_factor: 2 +12/08/2025 06:28:14 - INFO - __main__ - - persistent_workers: True +12/08/2025 06:28:14 - INFO - __main__ - - pin_memory: True +12/08/2025 06:28:14 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/08/2025 06:28:30 - INFO - __main__ - ***** Running training ***** +12/08/2025 06:28:30 - INFO - __main__ - Num training steps = 10000 +12/08/2025 06:28:30 - INFO - __main__ - Instantaneous batch size per device = 8 +12/08/2025 06:28:30 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/08/2025 06:28:30 - INFO - __main__ - Gradient Accumulation steps = 1 +12/08/2025 06:29:11 - INFO - __main__ - Step: 10 Loss: 11.0969 LR: 0.000300 +12/08/2025 06:29:32 - INFO - __main__ - Step: 20 Loss: 11.0993 LR: 0.000300 +12/08/2025 06:29:52 - INFO - __main__ - Step: 30 Loss: 11.1012 LR: 0.000300 +12/08/2025 06:30:13 - INFO - __main__ - Step: 40 Loss: 11.0992 LR: 0.000300 +12/08/2025 06:30:34 - INFO - __main__ - Step: 50 Loss: 11.1045 LR: 0.000300 +12/08/2025 06:30:56 - INFO - __main__ - Step: 60 Loss: 11.0981 LR: 0.000300 +12/08/2025 06:31:16 - INFO - __main__ - Step: 70 Loss: 11.0999 LR: 0.000300 +12/08/2025 06:31:37 - INFO - __main__ - Step: 80 Loss: 11.0901 LR: 0.000300 +12/08/2025 06:31:59 - INFO - __main__ - Step: 90 Loss: 11.0786 LR: 0.000300 +12/08/2025 06:32:20 - INFO - __main__ - Step: 100 Loss: 11.0815 LR: 0.000300 +12/08/2025 06:32:41 - INFO - __main__ - Step: 110 Loss: 11.0670 LR: 0.000300 +12/08/2025 06:33:02 - INFO - __main__ - Step: 120 Loss: 11.0464 LR: 0.000300 +12/08/2025 06:33:23 - INFO - __main__ - Step: 130 Loss: 11.0319 LR: 0.000300 +12/08/2025 06:33:45 - INFO - __main__ - Step: 140 Loss: 11.0280 LR: 0.000300 +12/08/2025 06:34:06 - INFO - __main__ - Step: 150 Loss: 11.0030 LR: 0.000300 +12/08/2025 06:34:27 - INFO - __main__ - Step: 160 Loss: 10.9769 LR: 0.000300 +12/08/2025 06:34:49 - INFO - __main__ - Step: 170 Loss: 10.9461 LR: 0.000300 +12/08/2025 06:35:10 - INFO - __main__ - Step: 180 Loss: 10.9273 LR: 0.000300 +12/08/2025 06:35:31 - INFO - __main__ - Step: 190 Loss: 10.9210 LR: 0.000300 +12/08/2025 06:35:53 - INFO - __main__ - Step: 200 Loss: 10.8968 LR: 0.000300 +12/08/2025 06:36:15 - INFO - __main__ - Step: 210 Loss: 10.8888 LR: 0.000300 +12/08/2025 06:36:37 - INFO - __main__ - Step: 220 Loss: 10.8603 LR: 0.000300 +12/08/2025 06:36:58 - INFO - __main__ - Step: 230 Loss: 10.8578 LR: 0.000300 +12/08/2025 06:37:20 - INFO - __main__ - Step: 240 Loss: 10.8413 LR: 0.000300 +12/08/2025 06:37:42 - INFO - __main__ - Step: 250 Loss: 10.8050 LR: 0.000300 +12/08/2025 06:38:04 - INFO - __main__ - Step: 260 Loss: 10.8143 LR: 0.000300 +12/08/2025 06:38:25 - INFO - __main__ - Step: 270 Loss: 10.7800 LR: 0.000300 +12/08/2025 06:38:47 - INFO - __main__ - Step: 280 Loss: 10.7418 LR: 0.000300 +12/08/2025 06:39:08 - INFO - __main__ - Step: 290 Loss: 10.7516 LR: 0.000300 +12/08/2025 06:39:31 - INFO - __main__ - Step: 300 Loss: 10.7369 LR: 0.000300 +12/08/2025 06:39:52 - INFO - __main__ - Step: 310 Loss: 10.7122 LR: 0.000300 +12/08/2025 06:40:14 - INFO - __main__ - Step: 320 Loss: 10.7289 LR: 0.000300 +12/08/2025 06:40:36 - INFO - __main__ - Step: 330 Loss: 10.7455 LR: 0.000300 +12/08/2025 06:40:58 - INFO - __main__ - Step: 340 Loss: 10.7406 LR: 0.000300 +12/08/2025 06:41:20 - INFO - __main__ - Step: 350 Loss: 10.6698 LR: 0.000300 +12/08/2025 06:41:42 - INFO - __main__ - Step: 360 Loss: 10.6939 LR: 0.000300 +12/08/2025 06:42:04 - INFO - __main__ - Step: 370 Loss: 10.6785 LR: 0.000300 +12/08/2025 06:42:27 - INFO - __main__ - Step: 380 Loss: 10.6714 LR: 0.000300 +12/08/2025 06:42:48 - INFO - __main__ - Step: 390 Loss: 10.6647 LR: 0.000300 +12/08/2025 06:43:10 - INFO - __main__ - Step: 400 Loss: 10.6492 LR: 0.000300 +12/08/2025 06:43:32 - INFO - __main__ - Step: 410 Loss: 10.6488 LR: 0.000300 +12/08/2025 06:43:53 - INFO - __main__ - Step: 420 Loss: 10.6739 LR: 0.000300 +12/08/2025 06:44:15 - INFO - __main__ - Step: 430 Loss: 10.6344 LR: 0.000300 +12/08/2025 06:44:37 - INFO - __main__ - Step: 440 Loss: 10.6503 LR: 0.000300 +12/08/2025 
06:44:59 - INFO - __main__ - Step: 450 Loss: 10.6358 LR: 0.000300 +12/08/2025 06:45:22 - INFO - __main__ - Step: 460 Loss: 10.6386 LR: 0.000300 +12/08/2025 06:45:44 - INFO - __main__ - Step: 470 Loss: 10.6178 LR: 0.000300 +12/08/2025 06:46:06 - INFO - __main__ - Step: 480 Loss: 10.6220 LR: 0.000300 +12/08/2025 06:46:28 - INFO - __main__ - Step: 490 Loss: 10.6692 LR: 0.000300 +12/08/2025 06:46:50 - INFO - __main__ - Step: 500 Loss: 10.6238 LR: 0.000300 +12/08/2025 06:46:50 - INFO - accelerate.accelerator - Saving current state to output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500 +12/08/2025 06:47:00 - INFO - accelerate.checkpointing - Optimizer state saved in output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/optimizer.bin +12/08/2025 06:47:00 - INFO - accelerate.checkpointing - Scheduler state saved in output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/scheduler.bin +12/08/2025 06:47:00 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/sampler.bin +12/08/2025 06:47:00 - INFO - accelerate.checkpointing - Random states saved in output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500/random_states_0.pkl +12/08/2025 06:47:00 - INFO - __main__ - Saved state to output_32x56_4f_8bs_4*8*8vqvae/checkpoint-500 +12/08/2025 06:47:00 - INFO - __main__ - Generating videos for validation... +12/08/2025 06:47:00 - INFO - __main__ - Generating videos for validation... +12/08/2025 06:47:00 - ERROR - __main__ - Video validation failed: num_frames (4) must be divisible by temporal downsampling factor (8) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1278, in main + videos = pipe( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/src/pipeline_video.py", line 541, in __call__ + raise ValueError( +ValueError: num_frames (4) must be divisible by temporal downsampling factor (8) +12/08/2025 06:47:22 - INFO - __main__ - Step: 510 Loss: 10.5937 LR: 0.000300 +12/08/2025 06:47:43 - INFO - __main__ - Step: 520 Loss: 10.5638 LR: 0.000300 +12/08/2025 06:48:06 - INFO - __main__ - Step: 530 Loss: 10.6022 LR: 0.000300 +12/08/2025 06:48:28 - INFO - __main__ - Step: 540 Loss: 10.6146 LR: 0.000300 +12/08/2025 06:48:50 - INFO - __main__ - Step: 550 Loss: 10.6357 LR: 0.000300 +12/08/2025 06:49:12 - INFO - __main__ - Step: 560 Loss: 10.5949 LR: 0.000300 +12/08/2025 06:49:35 - INFO - __main__ - Step: 570 Loss: 10.6044 LR: 0.000300 +12/08/2025 06:49:57 - INFO - __main__ - Step: 580 Loss: 10.6040 LR: 0.000300 +12/08/2025 06:50:19 - INFO - __main__ - Step: 590 Loss: 10.5649 LR: 0.000300 +12/08/2025 06:50:41 - INFO - __main__ - Step: 600 Loss: 10.5936 LR: 0.000300 +12/08/2025 06:51:03 - INFO - __main__ - Step: 610 Loss: 10.6108 LR: 0.000300 +12/08/2025 06:51:25 - INFO - __main__ - Step: 620 Loss: 10.5532 LR: 0.000300 +12/08/2025 06:51:47 - INFO - __main__ - Step: 630 Loss: 10.5550 LR: 0.000300 +12/08/2025 06:52:09 - INFO - __main__ - Step: 640 Loss: 10.6156 LR: 0.000300 +12/08/2025 06:52:30 - INFO - __main__ - Step: 650 Loss: 10.6020 LR: 0.000300 +12/08/2025 06:53:25 - INFO - __main__ - Step: 660 Loss: 10.5624 LR: 0.000300 +12/08/2025 06:53:47 - INFO - __main__ - Step: 670 Loss: 10.5785 LR: 0.000300 +12/08/2025 06:54:22 - INFO - __main__ - Step: 680 Loss: 10.5761 LR: 0.000300 +12/08/2025 06:54:47 - INFO - __main__ - Step: 690 Loss: 10.5741 LR: 0.000300 +12/08/2025 06:55:31 - INFO - __main__ - Step: 700 
Loss: 10.5785 LR: 0.000300 +12/08/2025 06:55:54 - INFO - __main__ - Step: 710 Loss: 10.5945 LR: 0.000300 +12/08/2025 06:56:43 - INFO - __main__ - Step: 720 Loss: 10.5611 LR: 0.000300 +12/08/2025 06:57:18 - INFO - __main__ - Step: 730 Loss: 10.5659 LR: 0.000300 +12/08/2025 06:57:39 - INFO - __main__ - Step: 740 Loss: 10.5558 LR: 0.000300 +12/08/2025 06:58:11 - INFO - __main__ - Step: 750 Loss: 10.5648 LR: 0.000300 +12/08/2025 06:58:43 - INFO - __main__ - Step: 760 Loss: 10.5705 LR: 0.000300 +12/08/2025 06:59:28 - INFO - __main__ - Step: 770 Loss: 10.5315 LR: 0.000300 +12/08/2025 06:59:50 - INFO - __main__ - Step: 780 Loss: 10.5513 LR: 0.000300 +12/08/2025 07:00:39 - INFO - __main__ - Step: 790 Loss: 10.5674 LR: 0.000300 +12/08/2025 07:01:00 - INFO - __main__ - Step: 800 Loss: 10.5174 LR: 0.000300 +12/08/2025 07:01:39 - INFO - __main__ - Step: 810 Loss: 10.5911 LR: 0.000300 +12/08/2025 07:02:02 - INFO - __main__ - Step: 820 Loss: 10.5389 LR: 0.000300 +12/08/2025 07:02:52 - INFO - __main__ - Step: 830 Loss: 10.5487 LR: 0.000300 +12/08/2025 07:03:13 - INFO - __main__ - Step: 840 Loss: 10.5537 LR: 0.000300 +12/08/2025 07:03:48 - INFO - __main__ - Step: 850 Loss: 10.5564 LR: 0.000300 +12/08/2025 07:04:21 - INFO - __main__ - Step: 860 Loss: 10.5350 LR: 0.000300 +12/08/2025 07:04:52 - INFO - __main__ - Step: 870 Loss: 10.5221 LR: 0.000300 +12/08/2025 07:05:35 - INFO - __main__ - Step: 880 Loss: 10.4884 LR: 0.000300 +12/08/2025 07:06:01 - INFO - __main__ - Step: 890 Loss: 10.5384 LR: 0.000300 +12/08/2025 07:06:45 - INFO - __main__ - Step: 900 Loss: 10.5458 LR: 0.000300 +12/08/2025 07:07:06 - INFO - __main__ - Step: 910 Loss: 10.5227 LR: 0.000300 +12/08/2025 07:08:01 - INFO - __main__ - Step: 920 Loss: 10.5706 LR: 0.000300 +12/08/2025 07:08:21 - INFO - __main__ - Step: 930 Loss: 10.5531 LR: 0.000300 +12/08/2025 07:08:43 - INFO - __main__ - Step: 940 Loss: 10.5201 LR: 0.000300 +12/08/2025 07:09:33 - INFO - __main__ - Step: 950 Loss: 10.5325 LR: 0.000300 +12/08/2025 07:09:59 - INFO - __main__ - Step: 960 Loss: 10.5191 LR: 0.000300 +12/08/2025 07:10:42 - INFO - __main__ - Step: 970 Loss: 10.5592 LR: 0.000300 +12/08/2025 07:11:12 - INFO - __main__ - Step: 980 Loss: 10.5364 LR: 0.000300 +12/08/2025 07:11:56 - INFO - __main__ - Step: 990 Loss: 10.5596 LR: 0.000300 +12/08/2025 07:12:18 - INFO - __main__ - Step: 1000 Loss: 10.4784 LR: 0.000300 +12/08/2025 07:12:18 - INFO - accelerate.accelerator - Saving current state to output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000 +12/08/2025 07:12:27 - INFO - accelerate.checkpointing - Optimizer state saved in output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/optimizer.bin +12/08/2025 07:12:27 - INFO - accelerate.checkpointing - Scheduler state saved in output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/scheduler.bin +12/08/2025 07:12:27 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/sampler.bin +12/08/2025 07:12:27 - INFO - accelerate.checkpointing - Random states saved in output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000/random_states_0.pkl +12/08/2025 07:12:27 - INFO - __main__ - Saved state to output_32x56_4f_8bs_4*8*8vqvae/checkpoint-1000 +12/08/2025 07:12:27 - INFO - __main__ - Generating videos for validation... +12/08/2025 07:12:27 - INFO - __main__ - Generating videos for validation... 
+12/08/2025 07:12:27 - ERROR - __main__ - Video validation failed: num_frames (4) must be divisible by temporal downsampling factor (8) +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1278, in main + videos = pipe( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + File "/mnt/Meissonic/src/pipeline_video.py", line 541, in __call__ + # raise ValueError( +ValueError: num_frames (4) must be divisible by temporal downsampling factor (8) +12/08/2025 07:13:05 - INFO - __main__ - Step: 1010 Loss: 10.5068 LR: 0.000300 +12/08/2025 07:13:26 - INFO - __main__ - Step: 1020 Loss: 10.5221 LR: 0.000300 +12/08/2025 07:14:23 - INFO - __main__ - Step: 1030 Loss: 10.5297 LR: 0.000300 +12/08/2025 07:14:44 - INFO - __main__ - Step: 1040 Loss: 10.5535 LR: 0.000300 +12/08/2025 07:15:22 - INFO - __main__ - Step: 1050 Loss: 10.4940 LR: 0.000300 +12/08/2025 07:16:09 - INFO - __main__ - Step: 1060 Loss: 10.5193 LR: 0.000300 +12/08/2025 07:16:29 - INFO - __main__ - Step: 1070 Loss: 10.5210 LR: 0.000300 +12/08/2025 07:17:11 - INFO - __main__ - Step: 1080 Loss: 10.5040 LR: 0.000300 diff --git a/Meissonic/wandb/run-20251208_062741-qalkbn80/files/requirements.txt b/Meissonic/wandb/run-20251208_062741-qalkbn80/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251208_062741-qalkbn80/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 
+smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_062741-qalkbn80/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_062741-qalkbn80/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f0ec402a259213b8efe30829beccb256a9922e76 --- /dev/null +++ b/Meissonic/wandb/run-20251208_062741-qalkbn80/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T06:27:41.588262Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "4", + "--video_height", + "32", + "--video_width", + "56", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_32x56_4f_8bs_4*8*8vqvae", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11819721035776" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": 
"NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "k0gn0kc3ownpjwqi8jnk80jh4wuclza1" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug-core.log b/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2e34670a5d26421792ba54893afade248a1fb4b9 --- /dev/null +++ b/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-08T06:27:41.660179644Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpwsrt0puo/port-1006985.txt","pid":1006985,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T06:27:41.660621135Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1006985} +{"time":"2025-12-08T06:27:41.660633944Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1006985-1007245-2704984213/socket","Net":"unix"}} +{"time":"2025-12-08T06:27:41.847376456Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T06:27:41.853178851Z","level":"INFO","msg":"handleInformInit: received","streamId":"qalkbn80","id":"1(@)"} +{"time":"2025-12-08T06:27:42.023589568Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"qalkbn80","id":"1(@)"} +{"time":"2025-12-08T07:17:14.111168672Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug-internal.log b/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b82da0c7d7cd4bdf482bdc98a1decdc61ffa65fc --- /dev/null +++ b/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-08T06:27:41.853277789Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T06:27:42.023420232Z","level":"INFO","msg":"stream: created new stream","id":"qalkbn80"} +{"time":"2025-12-08T06:27:42.023501295Z","level":"INFO","msg":"handler: started","stream_id":"qalkbn80"} +{"time":"2025-12-08T06:27:42.023581598Z","level":"INFO","msg":"stream: started","id":"qalkbn80"} +{"time":"2025-12-08T06:27:42.023599784Z","level":"INFO","msg":"sender: started","stream_id":"qalkbn80"} 
+{"time":"2025-12-08T06:27:42.023603396Z","level":"INFO","msg":"writer: started","stream_id":"qalkbn80"} diff --git a/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug.log b/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..28abeefcf99357fec2c4d10f8b2646a4bf87ac3d --- /dev/null +++ b/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_setup.py:_flush():80] Configure stats pid to 1006985 +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug.log +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_062741-qalkbn80/logs/debug-internal.log +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_init.py:init():841] calling init triggers +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 06:27:41,591 INFO MainThread:1006985 [wandb_init.py:init():889] starting backend +2025-12-08 06:27:41,847 INFO MainThread:1006985 [wandb_init.py:init():892] sending inform_init request +2025-12-08 06:27:41,851 INFO MainThread:1006985 [wandb_init.py:init():900] backend started and connected +2025-12-08 06:27:41,852 INFO MainThread:1006985 [wandb_init.py:init():970] updated telemetry +2025-12-08 06:27:41,857 INFO MainThread:1006985 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 06:27:42,221 INFO MainThread:1006985 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 06:27:42,330 INFO MainThread:1006985 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 06:27:42,330 INFO MainThread:1006985 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 06:27:42,330 INFO MainThread:1006985 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 06:27:42,330 INFO MainThread:1006985 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-08 06:27:42,333 INFO MainThread:1006985 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 06:27:42,334 INFO MainThread:1006985 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_32x56_4f_8bs_4*8*8vqvae', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 32, 'video_width': 56, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251208_062741-qalkbn80/run-qalkbn80.wandb b/Meissonic/wandb/run-20251208_062741-qalkbn80/run-qalkbn80.wandb new file mode 100644 index 0000000000000000000000000000000000000000..30c19bb3b99bad76cdcd6557e4e665bc32502fbf --- /dev/null +++ b/Meissonic/wandb/run-20251208_062741-qalkbn80/run-qalkbn80.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36015ce0a4a235532e09c78c419a4c7f2c496cb34e5c654be87efd8db78d2f9e +size 753664 diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1000_5fba5b3b9257a6ea4286.png b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1000_5fba5b3b9257a6ea4286.png new file mode 100644 index 0000000000000000000000000000000000000000..ec5038cf126880965951c4cd6b3c7e782541a01d Binary files /dev/null and b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1000_5fba5b3b9257a6ea4286.png differ diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1000_76389031b22f5747f9d1.png b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1000_76389031b22f5747f9d1.png new file mode 100644 index 0000000000000000000000000000000000000000..696a127bb2b8baf83f042ca6a79e2c8847c12b4c Binary files /dev/null and b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1000_76389031b22f5747f9d1.png differ diff 
--git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1500_53281bd82574163ad2ab.png b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1500_53281bd82574163ad2ab.png new file mode 100644 index 0000000000000000000000000000000000000000..d62bca398a77b34ae3370abb06c9a589c619e8d1 Binary files /dev/null and b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1500_53281bd82574163ad2ab.png differ diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1500_9d4f74d0f61cf2b121fe.png b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1500_9d4f74d0f61cf2b121fe.png new file mode 100644 index 0000000000000000000000000000000000000000..9d515824cd840ec4cc777ea421db55149f10f9e2 Binary files /dev/null and b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_1500_9d4f74d0f61cf2b121fe.png differ diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_500_884f228ba161275a3639.png b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_500_884f228ba161275a3639.png new file mode 100644 index 0000000000000000000000000000000000000000..c949d7f93efb8581eb380eaecaaa4cb13b5e9a64 Binary files /dev/null and b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_500_884f228ba161275a3639.png differ diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_500_a607f990a79bec5f1066.png b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_500_a607f990a79bec5f1066.png new file mode 100644 index 0000000000000000000000000000000000000000..df56735949c1ee4dd76678a022b0c2fff4428dce Binary files /dev/null and b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/media/images/generated_videos_first_frame_500_a607f990a79bec5f1066.png differ diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/output.log b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d3290a9f72e5ce4d08a5f97e74e0d7fb158eba01 --- /dev/null +++ b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/output.log @@ -0,0 +1,359 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 68.78it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/08/2025 07:18:26 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3671.39it/s] +12/08/2025 07:18:28 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=16, W'=28 +12/08/2025 07:18:28 - INFO - __main__ - Theoretical dimensions: F'=1, H'=16, W'=28 +12/08/2025 07:18:28 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 07:18:28 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 07:18:43 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 07:18:43 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 07:18:45 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/08/2025 07:18:47 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/08/2025 07:18:56 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/08/2025 07:18:56 - INFO - train.dataset_utils - Using decord for video loading +12/08/2025 07:18:56 - INFO - __main__ - Dataloader configuration: +12/08/2025 07:18:56 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/08/2025 07:18:56 - INFO - __main__ - - prefetch_factor: 2 +12/08/2025 07:18:56 - INFO - __main__ - - persistent_workers: True +12/08/2025 07:18:56 - INFO - __main__ - - pin_memory: True +12/08/2025 07:18:56 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/08/2025 07:19:12 - INFO - __main__ - ***** Running training ***** +12/08/2025 07:19:12 - INFO - __main__ - Num training steps = 10000 +12/08/2025 07:19:12 - INFO - __main__ - Instantaneous batch size per device = 2 +12/08/2025 07:19:12 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 16 +12/08/2025 07:19:12 - INFO - __main__ - Gradient Accumulation steps = 1 +12/08/2025 07:19:41 - INFO - __main__ - Step: 10 Loss: 11.0791 LR: 0.000300 +12/08/2025 07:19:51 - INFO - __main__ - Step: 20 Loss: 11.0750 LR: 0.000300 +12/08/2025 07:20:01 - INFO - __main__ - Step: 30 Loss: 11.0739 LR: 0.000300 +12/08/2025 07:20:12 - INFO - __main__ - Step: 40 Loss: 11.0735 LR: 0.000300 +12/08/2025 07:20:22 - INFO - __main__ - Step: 50 Loss: 11.0703 LR: 0.000300 +12/08/2025 07:20:33 - INFO - __main__ - Step: 60 Loss: 11.0699 LR: 0.000300 +12/08/2025 07:20:43 - INFO - __main__ - Step: 70 Loss: 11.0723 LR: 0.000300 +12/08/2025 07:20:54 - INFO - __main__ - Step: 80 Loss: 11.0692 LR: 0.000300 +12/08/2025 07:21:04 - INFO - __main__ - Step: 90 Loss: 11.0679 LR: 0.000300 +12/08/2025 07:21:14 - INFO - __main__ - Step: 100 Loss: 11.0647 LR: 0.000300 +12/08/2025 07:21:25 - INFO - __main__ - Step: 110 Loss: 11.0511 LR: 0.000300 +12/08/2025 07:21:36 - INFO - __main__ - Step: 120 Loss: 11.0460 LR: 0.000300 +12/08/2025 07:21:46 - INFO - __main__ - Step: 130 Loss: 11.0441 LR: 0.000300 +12/08/2025 07:21:57 - INFO - __main__ - Step: 140 Loss: 11.0292 LR: 0.000300 +12/08/2025 07:22:07 - INFO - __main__ - Step: 150 Loss: 11.0091 LR: 0.000300 +12/08/2025 07:22:16 - INFO - __main__ - Step: 160 Loss: 10.9959 LR: 0.000300 +12/08/2025 07:22:27 - INFO - __main__ - Step: 170 Loss: 10.9785 LR: 0.000300 +12/08/2025 07:22:37 - INFO - __main__ - Step: 180 Loss: 10.9652 LR: 0.000300 +12/08/2025 07:22:48 - INFO - __main__ - Step: 190 Loss: 10.9404 LR: 0.000300 +12/08/2025 07:22:58 - INFO - __main__ - Step: 200 Loss: 10.9111 LR: 0.000300 +12/08/2025 07:23:09 - INFO - __main__ - Step: 210 Loss: 10.8844 LR: 0.000300 +12/08/2025 07:23:19 - INFO - __main__ - Step: 220 Loss: 10.8804 LR: 0.000300 +12/08/2025 07:23:29 - INFO - __main__ - Step: 230 Loss: 10.8483 LR: 0.000300 +12/08/2025 07:23:39 - INFO - __main__ - Step: 240 Loss: 10.8448 LR: 0.000300 +12/08/2025 07:23:50 - INFO - __main__ - Step: 250 Loss: 10.8387 LR: 0.000300 +12/08/2025 07:24:01 - INFO - __main__ - Step: 260 Loss: 10.8275 LR: 0.000300 +12/08/2025 07:24:11 - INFO - __main__ - Step: 270 Loss: 10.7833 LR: 0.000300 +12/08/2025 07:24:22 - INFO - __main__ - Step: 280 Loss: 10.8187 LR: 0.000300 +12/08/2025 07:24:33 - INFO - __main__ - Step: 290 Loss: 10.7622 LR: 0.000300 +12/08/2025 07:24:43 - INFO - __main__ - Step: 300 Loss: 10.7791 LR: 0.000300 +12/08/2025 07:24:54 - INFO - __main__ - Step: 310 Loss: 10.7556 LR: 0.000300 +12/08/2025 07:25:04 - INFO - __main__ - Step: 320 Loss: 10.7484 LR: 0.000300 +12/08/2025 07:25:14 - INFO - __main__ - Step: 330 Loss: 10.7761 LR: 0.000300 +12/08/2025 07:25:24 - INFO - __main__ - Step: 340 Loss: 10.7330 LR: 0.000300 +12/08/2025 07:25:35 - INFO - __main__ - Step: 350 Loss: 10.7359 LR: 0.000300 +12/08/2025 07:25:45 - INFO - __main__ - Step: 360 Loss: 10.7433 LR: 0.000300 +12/08/2025 07:25:55 - INFO - __main__ - Step: 370 Loss: 10.7281 LR: 0.000300 +12/08/2025 07:26:06 - INFO - __main__ - Step: 380 Loss: 10.7423 LR: 0.000300 +12/08/2025 07:26:16 - INFO - __main__ - Step: 390 Loss: 10.7052 LR: 0.000300 +12/08/2025 07:26:26 - INFO - __main__ - Step: 400 Loss: 10.6671 LR: 0.000300 +12/08/2025 07:26:37 - INFO - __main__ - Step: 410 Loss: 10.7009 LR: 0.000300 +12/08/2025 07:26:47 - INFO - __main__ - Step: 420 Loss: 10.6547 LR: 0.000300 +12/08/2025 07:26:57 - INFO - __main__ - Step: 430 Loss: 10.6868 LR: 0.000300 +12/08/2025 07:27:08 - INFO - __main__ - Step: 440 Loss: 10.6214 LR: 0.000300 +12/08/2025 
07:27:18 - INFO - __main__ - Step: 450 Loss: 10.6331 LR: 0.000300 +12/08/2025 07:27:28 - INFO - __main__ - Step: 460 Loss: 10.6731 LR: 0.000300 +12/08/2025 07:27:39 - INFO - __main__ - Step: 470 Loss: 10.6490 LR: 0.000300 +12/08/2025 07:27:50 - INFO - __main__ - Step: 480 Loss: 10.6500 LR: 0.000300 +12/08/2025 07:28:00 - INFO - __main__ - Step: 490 Loss: 10.6372 LR: 0.000300 +12/08/2025 07:28:10 - INFO - __main__ - Step: 500 Loss: 10.5730 LR: 0.000300 +12/08/2025 07:28:10 - INFO - accelerate.accelerator - Saving current state to output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500 +12/08/2025 07:28:18 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/optimizer.bin +12/08/2025 07:28:18 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/scheduler.bin +12/08/2025 07:28:18 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/sampler.bin +12/08/2025 07:28:18 - INFO - accelerate.checkpointing - Random states saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500/random_states_0.pkl +12/08/2025 07:28:18 - INFO - __main__ - Saved state to output_128x224_4f_2bs_4*8*8vqvae/checkpoint-500 +12/08/2025 07:28:18 - INFO - __main__ - Generating videos for validation... +12/08/2025 07:28:18 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.28it/s] +12/08/2025 07:28:25 - INFO - __main__ - Validation videos saved to ./output_128x224_4f_2bs_4*8*8vqvae +12/08/2025 07:28:35 - INFO - __main__ - Step: 510 Loss: 10.6406 LR: 0.000300 +12/08/2025 07:28:46 - INFO - __main__ - Step: 520 Loss: 10.6642 LR: 0.000300 +12/08/2025 07:28:56 - INFO - __main__ - Step: 530 Loss: 10.6562 LR: 0.000300 +12/08/2025 07:29:07 - INFO - __main__ - Step: 540 Loss: 10.6532 LR: 0.000300 +12/08/2025 07:29:18 - INFO - __main__ - Step: 550 Loss: 10.6268 LR: 0.000300 +12/08/2025 07:29:28 - INFO - __main__ - Step: 560 Loss: 10.5279 LR: 0.000300 +12/08/2025 07:29:39 - INFO - __main__ - Step: 570 Loss: 10.6712 LR: 0.000300 +12/08/2025 07:29:49 - INFO - __main__ - Step: 580 Loss: 10.6163 LR: 0.000300 +12/08/2025 07:30:00 - INFO - __main__ - Step: 590 Loss: 10.5797 LR: 0.000300 +12/08/2025 07:30:12 - INFO - __main__ - Step: 600 Loss: 10.5672 LR: 0.000300 +12/08/2025 07:30:23 - INFO - __main__ - Step: 610 Loss: 10.6494 LR: 0.000300 +12/08/2025 07:30:35 - INFO - __main__ - Step: 620 Loss: 10.5624 LR: 0.000300 +12/08/2025 07:30:45 - INFO - __main__ - Step: 630 Loss: 10.6043 LR: 0.000300 +12/08/2025 07:30:55 - INFO - __main__ - Step: 640 Loss: 10.5373 LR: 0.000300 +12/08/2025 07:31:06 - INFO - __main__ - Step: 650 Loss: 10.5340 LR: 0.000300 +12/08/2025 07:31:17 - INFO - __main__ - Step: 660 Loss: 10.5840 LR: 0.000300 +12/08/2025 07:31:28 - INFO - __main__ - Step: 670 Loss: 10.5831 LR: 0.000300 +12/08/2025 07:31:39 - INFO - __main__ - Step: 680 Loss: 10.6162 LR: 0.000300 +12/08/2025 07:31:49 - INFO - __main__ - Step: 690 Loss: 10.5944 LR: 0.000300 +12/08/2025 07:31:59 - INFO - __main__ - Step: 700 Loss: 10.5457 LR: 0.000300 +12/08/2025 07:32:10 - INFO - __main__ - Step: 710 Loss: 10.5504 LR: 0.000300 +12/08/2025 07:32:21 - INFO - __main__ - Step: 720 Loss: 10.5841 LR: 0.000300 +12/08/2025 07:32:31 - INFO - __main__ - Step: 730 Loss: 10.5708 LR: 0.000300 +12/08/2025 07:32:42 - INFO - 
__main__ - Step: 740 Loss: 10.5878 LR: 0.000300 +12/08/2025 07:32:53 - INFO - __main__ - Step: 750 Loss: 10.4668 LR: 0.000300 +12/08/2025 07:33:04 - INFO - __main__ - Step: 760 Loss: 10.5180 LR: 0.000300 +12/08/2025 07:33:14 - INFO - __main__ - Step: 770 Loss: 10.5158 LR: 0.000300 +12/08/2025 07:33:25 - INFO - __main__ - Step: 780 Loss: 10.5711 LR: 0.000300 +12/08/2025 07:33:35 - INFO - __main__ - Step: 790 Loss: 10.4961 LR: 0.000300 +12/08/2025 07:33:47 - INFO - __main__ - Step: 800 Loss: 10.5915 LR: 0.000300 +12/08/2025 07:33:58 - INFO - __main__ - Step: 810 Loss: 10.5373 LR: 0.000300 +12/08/2025 07:34:10 - INFO - __main__ - Step: 820 Loss: 10.4458 LR: 0.000300 +12/08/2025 07:34:20 - INFO - __main__ - Step: 830 Loss: 10.5727 LR: 0.000300 +12/08/2025 07:34:30 - INFO - __main__ - Step: 840 Loss: 10.5203 LR: 0.000300 +12/08/2025 07:34:41 - INFO - __main__ - Step: 850 Loss: 10.5065 LR: 0.000300 +12/08/2025 07:34:53 - INFO - __main__ - Step: 860 Loss: 10.5346 LR: 0.000300 +12/08/2025 07:35:05 - INFO - __main__ - Step: 870 Loss: 10.4913 LR: 0.000300 +12/08/2025 07:35:16 - INFO - __main__ - Step: 880 Loss: 10.4633 LR: 0.000300 +12/08/2025 07:35:27 - INFO - __main__ - Step: 890 Loss: 10.5258 LR: 0.000300 +12/08/2025 07:35:37 - INFO - __main__ - Step: 900 Loss: 10.4886 LR: 0.000300 +12/08/2025 07:35:47 - INFO - __main__ - Step: 910 Loss: 10.4839 LR: 0.000300 +12/08/2025 07:35:58 - INFO - __main__ - Step: 920 Loss: 10.5210 LR: 0.000300 +12/08/2025 07:36:10 - INFO - __main__ - Step: 930 Loss: 10.4808 LR: 0.000300 +12/08/2025 07:36:21 - INFO - __main__ - Step: 940 Loss: 10.4740 LR: 0.000300 +12/08/2025 07:36:32 - INFO - __main__ - Step: 950 Loss: 10.5103 LR: 0.000300 +12/08/2025 07:36:43 - INFO - __main__ - Step: 960 Loss: 10.5238 LR: 0.000300 +12/08/2025 07:36:54 - INFO - __main__ - Step: 970 Loss: 10.5924 LR: 0.000300 +12/08/2025 07:37:04 - INFO - __main__ - Step: 980 Loss: 10.5212 LR: 0.000300 +12/08/2025 07:37:16 - INFO - __main__ - Step: 990 Loss: 10.4879 LR: 0.000300 +12/08/2025 07:37:27 - INFO - __main__ - Step: 1000 Loss: 10.6040 LR: 0.000300 +12/08/2025 07:37:27 - INFO - accelerate.accelerator - Saving current state to output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000 +12/08/2025 07:37:34 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/optimizer.bin +12/08/2025 07:37:34 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/scheduler.bin +12/08/2025 07:37:34 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/sampler.bin +12/08/2025 07:37:34 - INFO - accelerate.checkpointing - Random states saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000/random_states_0.pkl +12/08/2025 07:37:34 - INFO - __main__ - Saved state to output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1000 +12/08/2025 07:37:34 - INFO - __main__ - Generating videos for validation... +12/08/2025 07:37:34 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:07<00:00, 6.59it/s] +12/08/2025 07:37:43 - INFO - __main__ - Validation videos saved to ./output_128x224_4f_2bs_4*8*8vqvae +12/08/2025 07:37:55 - INFO - __main__ - Step: 1010 Loss: 10.5551 LR: 0.000300 +12/08/2025 07:38:06 - INFO - __main__ - Step: 1020 Loss: 10.5330 LR: 0.000300 +12/08/2025 07:38:18 - INFO - __main__ - Step: 1030 Loss: 10.5111 LR: 0.000300 +12/08/2025 07:38:29 - INFO - __main__ - Step: 1040 Loss: 10.4348 LR: 0.000300 +12/08/2025 07:38:41 - INFO - __main__ - Step: 1050 Loss: 10.4561 LR: 0.000300 +12/08/2025 07:38:52 - INFO - __main__ - Step: 1060 Loss: 10.5186 LR: 0.000300 +12/08/2025 07:39:03 - INFO - __main__ - Step: 1070 Loss: 10.5010 LR: 0.000300 +12/08/2025 07:39:15 - INFO - __main__ - Step: 1080 Loss: 10.4697 LR: 0.000300 +12/08/2025 07:39:26 - INFO - __main__ - Step: 1090 Loss: 10.4187 LR: 0.000300 +12/08/2025 07:39:38 - INFO - __main__ - Step: 1100 Loss: 10.4345 LR: 0.000300 +12/08/2025 07:39:50 - INFO - __main__ - Step: 1110 Loss: 10.4531 LR: 0.000300 +12/08/2025 07:40:01 - INFO - __main__ - Step: 1120 Loss: 10.4995 LR: 0.000300 +12/08/2025 07:40:12 - INFO - __main__ - Step: 1130 Loss: 10.5177 LR: 0.000300 +12/08/2025 07:40:23 - INFO - __main__ - Step: 1140 Loss: 10.4628 LR: 0.000300 +12/08/2025 07:40:34 - INFO - __main__ - Step: 1150 Loss: 10.5039 LR: 0.000300 +12/08/2025 07:40:45 - INFO - __main__ - Step: 1160 Loss: 10.5378 LR: 0.000300 +12/08/2025 07:40:56 - INFO - __main__ - Step: 1170 Loss: 10.4866 LR: 0.000300 +12/08/2025 07:41:07 - INFO - __main__ - Step: 1180 Loss: 10.4979 LR: 0.000300 +12/08/2025 07:41:19 - INFO - __main__ - Step: 1190 Loss: 10.4341 LR: 0.000300 +12/08/2025 07:41:30 - INFO - __main__ - Step: 1200 Loss: 10.4345 LR: 0.000300 +12/08/2025 07:41:42 - INFO - __main__ - Step: 1210 Loss: 10.5064 LR: 0.000300 +12/08/2025 07:41:53 - INFO - __main__ - Step: 1220 Loss: 10.5068 LR: 0.000300 +12/08/2025 07:42:04 - INFO - __main__ - Step: 1230 Loss: 10.5067 LR: 0.000300 +12/08/2025 07:42:16 - INFO - __main__ - Step: 1240 Loss: 10.5162 LR: 0.000300 +12/08/2025 07:42:27 - INFO - __main__ - Step: 1250 Loss: 10.5313 LR: 0.000300 +12/08/2025 07:42:38 - INFO - __main__ - Step: 1260 Loss: 10.4982 LR: 0.000300 +12/08/2025 07:42:50 - INFO - __main__ - Step: 1270 Loss: 10.4725 LR: 0.000300 +12/08/2025 07:43:01 - INFO - __main__ - Step: 1280 Loss: 10.5044 LR: 0.000300 +12/08/2025 07:43:12 - INFO - __main__ - Step: 1290 Loss: 10.4312 LR: 0.000300 +12/08/2025 07:43:24 - INFO - __main__ - Step: 1300 Loss: 10.4405 LR: 0.000300 +12/08/2025 07:43:35 - INFO - __main__ - Step: 1310 Loss: 10.4837 LR: 0.000300 +12/08/2025 07:43:45 - INFO - __main__ - Step: 1320 Loss: 10.4423 LR: 0.000300 +12/08/2025 07:43:57 - INFO - __main__ - Step: 1330 Loss: 10.4131 LR: 0.000300 +12/08/2025 07:44:08 - INFO - __main__ - Step: 1340 Loss: 10.4227 LR: 0.000300 +12/08/2025 07:44:20 - INFO - __main__ - Step: 1350 Loss: 10.5178 LR: 0.000300 +12/08/2025 07:44:31 - INFO - __main__ - Step: 1360 Loss: 10.4928 LR: 0.000300 +12/08/2025 07:44:41 - INFO - __main__ - Step: 1370 Loss: 10.5283 LR: 0.000300 +12/08/2025 07:44:52 - INFO - __main__ - Step: 1380 Loss: 10.4598 LR: 0.000300 +12/08/2025 07:45:03 - INFO - __main__ - Step: 1390 Loss: 10.5008 LR: 0.000300 +12/08/2025 07:45:15 - INFO - __main__ - Step: 1400 Loss: 10.5150 LR: 0.000300 +12/08/2025 07:45:26 - INFO - __main__ - Step: 1410 Loss: 10.4277 LR: 0.000300 +12/08/2025 07:45:37 - 
INFO - __main__ - Step: 1420 Loss: 10.5060 LR: 0.000300 +12/08/2025 07:45:48 - INFO - __main__ - Step: 1430 Loss: 10.4265 LR: 0.000300 +12/08/2025 07:46:00 - INFO - __main__ - Step: 1440 Loss: 10.3760 LR: 0.000300 +12/08/2025 07:46:11 - INFO - __main__ - Step: 1450 Loss: 10.4575 LR: 0.000300 +12/08/2025 07:46:22 - INFO - __main__ - Step: 1460 Loss: 10.4462 LR: 0.000300 +12/08/2025 07:46:34 - INFO - __main__ - Step: 1470 Loss: 10.4534 LR: 0.000300 +12/08/2025 07:46:45 - INFO - __main__ - Step: 1480 Loss: 10.4444 LR: 0.000300 +12/08/2025 07:46:55 - INFO - __main__ - Step: 1490 Loss: 10.4124 LR: 0.000300 +12/08/2025 07:47:06 - INFO - __main__ - Step: 1500 Loss: 10.5423 LR: 0.000300 +12/08/2025 07:47:06 - INFO - accelerate.accelerator - Saving current state to output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500 +12/08/2025 07:47:15 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/optimizer.bin +12/08/2025 07:47:15 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/scheduler.bin +12/08/2025 07:47:15 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/sampler.bin +12/08/2025 07:47:15 - INFO - accelerate.checkpointing - Random states saved in output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500/random_states_0.pkl +12/08/2025 07:47:15 - INFO - __main__ - Saved state to output_128x224_4f_2bs_4*8*8vqvae/checkpoint-1500 +12/08/2025 07:47:15 - INFO - __main__ - Generating videos for validation... +12/08/2025 07:47:15 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.10it/s] +12/08/2025 07:47:21 - INFO - __main__ - Validation videos saved to ./output_128x224_4f_2bs_4*8*8vqvae +12/08/2025 07:47:32 - INFO - __main__ - Step: 1510 Loss: 10.4962 LR: 0.000300 +12/08/2025 07:47:44 - INFO - __main__ - Step: 1520 Loss: 10.4884 LR: 0.000300 +12/08/2025 07:47:55 - INFO - __main__ - Step: 1530 Loss: 10.4585 LR: 0.000300 +12/08/2025 07:48:07 - INFO - __main__ - Step: 1540 Loss: 10.5206 LR: 0.000300 +12/08/2025 07:48:18 - INFO - __main__ - Step: 1550 Loss: 10.5191 LR: 0.000300 +12/08/2025 07:48:29 - INFO - __main__ - Step: 1560 Loss: 10.4605 LR: 0.000300 +12/08/2025 07:48:41 - INFO - __main__ - Step: 1570 Loss: 10.4379 LR: 0.000300 +12/08/2025 07:48:52 - INFO - __main__ - Step: 1580 Loss: 10.3727 LR: 0.000300 +12/08/2025 07:49:04 - INFO - __main__ - Step: 1590 Loss: 10.4478 LR: 0.000300 +12/08/2025 07:49:15 - INFO - __main__ - Step: 1600 Loss: 10.4360 LR: 0.000300 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1359, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1171, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward + out_list = torch.utils.checkpoint.checkpoint( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 489, in forward + x = cross_attn_ffn(x, context, context_lens, e) + File "/mnt/Meissonic/src/transformer_video.py", line 477, in cross_attn_ffn + x = 
x + self.cross_attn(self.norm3(x), context, context_lens) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 388, in forward + q = self.norm_q(self.q(x)).view(b, -1, n, d) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 281, in forward + return self._norm(x.float()).type_as(x) * self.weight.type_as(x) + File "/mnt/Meissonic/src/transformer_video.py", line 284, in _norm + return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1129, in pack_hook + def pack_hook(x): +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1359, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1171, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward +[rank0]: return model_forward(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, 
**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward +[rank0]: out_list = torch.utils.checkpoint.checkpoint( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint +[rank0]: ret = function(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 489, in forward +[rank0]: x = cross_attn_ffn(x, context, context_lens, e) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 477, in cross_attn_ffn +[rank0]: x = x + self.cross_attn(self.norm3(x), context, context_lens) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 388, in forward +[rank0]: q = self.norm_q(self.q(x)).view(b, -1, n, d) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File 
"/mnt/Meissonic/src/transformer_video.py", line 281, in forward +[rank0]: return self._norm(x.float()).type_as(x) * self.weight.type_as(x) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 284, in _norm +[rank0]: return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1129, in pack_hook +[rank0]: def pack_hook(x): +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/requirements.txt b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a8c4148adaf6c6672260af35dbc4f306eaeb16c --- /dev/null +++ b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/requirements.txt @@ -0,0 +1,139 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +aiohttp==3.13.2 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +pandas==2.3.3 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 
+nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..32bd4f42d87816ec7051f233779e9c11846b44d4 --- /dev/null +++ b/Meissonic/wandb/run-20251208_071823-0hjx73rw/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T07:18:23.137418Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--num_frames", + "4", + "--video_height", + "128", + "--video_width", + "224", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x224_4f_2bs_4*8*8vqvae", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11831191023616" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA 
A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "455p1nc4ntuffrug5i0ormilp4qait21" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug-core.log b/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..bfd08d20e5678a4258e6fec10ffdc657bf955cf8 --- /dev/null +++ b/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-08T07:18:23.203099372Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpseyl58k4/port-2490558.txt","pid":2490558,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T07:18:23.20364499Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2490558} +{"time":"2025-12-08T07:18:23.203657229Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2490558-2490821-350546645/socket","Net":"unix"}} +{"time":"2025-12-08T07:18:23.390280295Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T07:18:23.396482982Z","level":"INFO","msg":"handleInformInit: received","streamId":"0hjx73rw","id":"1(@)"} +{"time":"2025-12-08T07:18:23.561433511Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"0hjx73rw","id":"1(@)"} +{"time":"2025-12-08T07:49:20.399918343Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug-internal.log b/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..803928fa69963b998da035c419954c265e90bf30 --- /dev/null +++ b/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-08T07:18:23.396700819Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T07:18:23.561211348Z","level":"INFO","msg":"stream: created new stream","id":"0hjx73rw"} +{"time":"2025-12-08T07:18:23.561299615Z","level":"INFO","msg":"handler: started","stream_id":"0hjx73rw"} +{"time":"2025-12-08T07:18:23.561424841Z","level":"INFO","msg":"stream: started","id":"0hjx73rw"} +{"time":"2025-12-08T07:18:23.561440104Z","level":"INFO","msg":"writer: started","stream_id":"0hjx73rw"} +{"time":"2025-12-08T07:18:23.561440819Z","level":"INFO","msg":"sender: started","stream_id":"0hjx73rw"} diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug.log b/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ab27e1c2a274b132415946312b79ea7294b323b7 --- /dev/null +++ b/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_setup.py:_flush():80] Configure stats pid to 2490558 +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_setup.py:_flush():80] Loading 
settings from /home/ubuntu/.config/wandb/settings +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug.log +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_071823-0hjx73rw/logs/debug-internal.log +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_init.py:init():841] calling init triggers +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 07:18:23,140 INFO MainThread:2490558 [wandb_init.py:init():889] starting backend +2025-12-08 07:18:23,390 INFO MainThread:2490558 [wandb_init.py:init():892] sending inform_init request +2025-12-08 07:18:23,394 INFO MainThread:2490558 [wandb_init.py:init():900] backend started and connected +2025-12-08 07:18:23,396 INFO MainThread:2490558 [wandb_init.py:init():970] updated telemetry +2025-12-08 07:18:23,400 INFO MainThread:2490558 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 07:18:23,747 INFO MainThread:2490558 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 07:18:23,856 INFO MainThread:2490558 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 07:18:23,856 INFO MainThread:2490558 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 07:18:23,856 INFO MainThread:2490558 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 07:18:23,856 INFO MainThread:2490558 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-08 07:18:23,859 INFO MainThread:2490558 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 07:18:23,860 INFO MainThread:2490558 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x224_4f_2bs_4*8*8vqvae', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 128, 'video_width': 224, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B'} diff --git a/Meissonic/wandb/run-20251208_071823-0hjx73rw/run-0hjx73rw.wandb b/Meissonic/wandb/run-20251208_071823-0hjx73rw/run-0hjx73rw.wandb new file mode 100644 index 0000000000000000000000000000000000000000..1bdc181f2cfaf9097e586aac8f4f61a1507374b2 --- /dev/null +++ b/Meissonic/wandb/run-20251208_071823-0hjx73rw/run-0hjx73rw.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e444d4e446935f8afc716a8dc9c6a719d6314cbc073ed86d425452e2cb0cccd +size 589824 diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/files/config.yaml b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8e9d7d65cbb5a1098dfdf3904cc051180267235 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/config.yaml @@ -0,0 +1,296 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 0vt3ubbl1cfi1nqfghobvagoflxcjvzc: + args: + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "2" + - --gradient_accumulation_steps + - "1" 
+ - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11900304912384" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-08T15:56:07.007993Z" + writerId: 0vt3ubbl1cfi1nqfghobvagoflxcjvzc + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: 
+ value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 2 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 0.1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/files/output.log b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..cc0f9d565473a86e4fdcb6f24b6d0b3473233043 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/output.log @@ -0,0 +1,164 @@ +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 68.46it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/08/2025 15:56:10 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4299.96it/s] +12/08/2025 15:56:12 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=32, W'=56 +12/08/2025 15:56:12 - INFO - __main__ - Theoretical dimensions: F'=1, H'=32, W'=56 +12/08/2025 15:56:12 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 15:56:12 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 15:56:27 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 15:56:27 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 15:56:29 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/08/2025 15:56:31 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/08/2025 15:56:31 - INFO - __main__ - Wan backbone lr = 0.000030 (base_lr * 0.1) +12/08/2025 15:56:31 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/08/2025 15:56:31 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/08/2025 15:56:38 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/08/2025 15:56:38 - INFO - train.dataset_utils - Using decord for video loading +12/08/2025 15:56:38 - INFO - __main__ - Dataloader configuration: +12/08/2025 15:56:38 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/08/2025 15:56:38 - INFO - __main__ - - prefetch_factor: 2 +12/08/2025 15:56:38 - INFO - __main__ - - persistent_workers: True +12/08/2025 15:56:38 - INFO - __main__ - - pin_memory: True +12/08/2025 15:56:38 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/08/2025 15:56:57 - INFO - __main__ - ***** Running training ***** +12/08/2025 15:56:57 - INFO - __main__ - Num training steps = 10000 +12/08/2025 15:56:57 - INFO - __main__ - Instantaneous batch size per device = 2 +12/08/2025 15:56:57 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 16 +12/08/2025 15:56:57 - INFO - __main__ - Gradient Accumulation steps = 1 +12/08/2025 15:57:19 - INFO - __main__ - Step: 10 Loss: 11.0874 LR: 0.000030 +12/08/2025 15:57:30 - INFO - __main__ - Step: 20 Loss: 11.0766 LR: 0.000030 +12/08/2025 15:57:41 - INFO - __main__ - Step: 30 Loss: 11.0784 LR: 0.000030 +12/08/2025 15:57:52 - INFO - __main__ - Step: 40 Loss: 11.0731 LR: 0.000030 +12/08/2025 15:58:04 - INFO - __main__ - Step: 50 Loss: 11.0742 LR: 0.000030 +12/08/2025 15:58:15 - INFO - __main__ - Step: 60 Loss: 11.0691 LR: 0.000030 +12/08/2025 15:58:27 - INFO - __main__ - Step: 70 Loss: 11.0676 LR: 0.000030 +12/08/2025 15:58:38 - INFO - __main__ - Step: 80 Loss: 11.0666 LR: 0.000030 +12/08/2025 15:58:49 - INFO - __main__ - Step: 90 Loss: 11.0633 LR: 0.000030 +12/08/2025 15:59:00 - INFO - __main__ - Step: 100 Loss: 11.0617 LR: 0.000030 +12/08/2025 15:59:11 - INFO - __main__ - Step: 110 Loss: 11.0550 LR: 0.000030 +12/08/2025 15:59:23 - INFO - __main__ - Step: 120 Loss: 11.0503 LR: 0.000030 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1435, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1247, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + 
return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward + out_list = torch.utils.checkpoint.checkpoint( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 357, in forward + x = flash_attention( + File "/mnt/Meissonic/src/transformer_video.py", line 77, in flash_attention + q_lens = torch.tensor( +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1435, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1247, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) 
+[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward +[rank0]: return model_forward(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward +[rank0]: out_list = torch.utils.checkpoint.checkpoint( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint +[rank0]: ret = function(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward +[rank0]: y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in 
_wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 357, in forward +[rank0]: x = flash_attention( +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 77, in flash_attention +[rank0]: q_lens = torch.tensor( +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/files/requirements.txt b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 
+simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9ca92f178ce0933595fef14b0bd95b3f51a2cf30 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T15:56:07.007993Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11900304912384" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + 
"architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "0vt3ubbl1cfi1nqfghobvagoflxcjvzc" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/files/wandb-summary.json b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad32f50c2245505a6cd51fa77e9b45c2c086edc --- /dev/null +++ b/Meissonic/wandb/run-20251208_155607-ja5993e9/files/wandb-summary.json @@ -0,0 +1 @@ +{"avg_masking_rate":0.5684051513671875,"_timestamp":1.765209563009791e+09,"_step":120,"_runtime":197.270936192,"_wandb":{"runtime":197},"step_loss":11.050317764282227,"lr":2.9999999999999997e-05} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug-core.log b/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..3f002bed3d617a55d4dbc55514cd568540a2a670 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-08T15:56:07.075882669Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpii1iaw4p/port-3410532.txt","pid":3410532,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T15:56:07.076335169Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3410532} +{"time":"2025-12-08T15:56:07.076347658Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3410532-3410856-3631560017/socket","Net":"unix"}} +{"time":"2025-12-08T15:56:07.262421503Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T15:56:07.268400421Z","level":"INFO","msg":"handleInformInit: received","streamId":"ja5993e9","id":"1(@)"} +{"time":"2025-12-08T15:56:07.431953902Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ja5993e9","id":"1(@)"} +{"time":"2025-12-08T15:59:24.939887729Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-08T15:59:24.939946483Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-08T15:59:24.93997108Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-08T15:59:24.940000281Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-08T15:59:24.94006594Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3410532-3410856-3631560017/socket","Net":"unix"}} +{"time":"2025-12-08T15:59:25.355246261Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-08T15:59:25.355288673Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} 
+{"time":"2025-12-08T15:59:25.355306619Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug-internal.log b/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d91ea0ded5a3a0bdfd584666f3ed5d1de86f4dd5 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-08T15:56:07.26854712Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T15:56:07.431715392Z","level":"INFO","msg":"stream: created new stream","id":"ja5993e9"} +{"time":"2025-12-08T15:56:07.43179556Z","level":"INFO","msg":"handler: started","stream_id":"ja5993e9"} +{"time":"2025-12-08T15:56:07.43194582Z","level":"INFO","msg":"stream: started","id":"ja5993e9"} +{"time":"2025-12-08T15:56:07.43196348Z","level":"INFO","msg":"writer: started","stream_id":"ja5993e9"} +{"time":"2025-12-08T15:56:07.43196722Z","level":"INFO","msg":"sender: started","stream_id":"ja5993e9"} +{"time":"2025-12-08T15:59:24.939960682Z","level":"INFO","msg":"stream: closing","id":"ja5993e9"} +{"time":"2025-12-08T15:59:25.193247831Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-08T15:59:25.351322144Z","level":"INFO","msg":"handler: closed","stream_id":"ja5993e9"} +{"time":"2025-12-08T15:59:25.351682568Z","level":"INFO","msg":"sender: closed","stream_id":"ja5993e9"} +{"time":"2025-12-08T15:59:25.351702093Z","level":"INFO","msg":"stream: closed","id":"ja5993e9"} diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug.log b/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..a9898d7551095a29733d3db45f5b68b7ccef3965 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_setup.py:_flush():80] Configure stats pid to 3410532 +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug.log +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_155607-ja5993e9/logs/debug-internal.log +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_init.py:init():841] calling init triggers +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 15:56:07,010 INFO MainThread:3410532 [wandb_init.py:init():889] starting backend +2025-12-08 15:56:07,262 INFO MainThread:3410532 [wandb_init.py:init():892] sending inform_init request +2025-12-08 15:56:07,266 INFO MainThread:3410532 [wandb_init.py:init():900] backend started and connected +2025-12-08 15:56:07,268 INFO MainThread:3410532 [wandb_init.py:init():970] updated telemetry 
+2025-12-08 15:56:07,272 INFO MainThread:3410532 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 15:56:07,668 INFO MainThread:3410532 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 15:56:07,791 INFO MainThread:3410532 [wandb_run.py:_console_start():2521] atexit reg +2025-12-08 15:56:07,791 INFO MainThread:3410532 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 15:56:07,792 INFO MainThread:3410532 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 15:56:07,792 INFO MainThread:3410532 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-08 15:56:07,794 INFO MainThread:3410532 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 15:56:07,795 INFO MainThread:3410532 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.1} +2025-12-08 15:59:24,940 INFO wandb-AsyncioManager-main:3410532 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-08 15:59:24,940 INFO wandb-AsyncioManager-main:3410532 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251208_155607-ja5993e9/run-ja5993e9.wandb b/Meissonic/wandb/run-20251208_155607-ja5993e9/run-ja5993e9.wandb new file mode 100644 index 0000000000000000000000000000000000000000..532536289e0b74179b6410f48d12106677fc38cb Binary files /dev/null and b/Meissonic/wandb/run-20251208_155607-ja5993e9/run-ja5993e9.wandb differ diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/config.yaml b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99fdc429ea0225debb67b29c7ae42790fc323bcf --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/config.yaml @@ -0,0 +1,297 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + f0bt4siuu2gjzmv16hc4nme2mcfrsfsi: + args: + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "2" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "11900305113088" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: 
Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-08T15:59:43.669474Z" + writerId: f0bt4siuu2gjzmv16hc4nme2mcfrsfsi + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 2 + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 2 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 0.1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_8328d2d0556a95ff2759.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_8328d2d0556a95ff2759.png new file mode 100644 index 0000000000000000000000000000000000000000..c2aa59ddfc654ec9f82d4c9abff61590cc837f39 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_8328d2d0556a95ff2759.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8328d2d0556a95ff2759ddf86ad4dcf2e293d935c74841fcc2a3b4a6bcafa2a5 +size 143482 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_980ee3261a5cf9cce942.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_980ee3261a5cf9cce942.png new file mode 100644 index 0000000000000000000000000000000000000000..1a5c6ceb734f615b46258dee56e70de2cef8ee42 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_980ee3261a5cf9cce942.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:980ee3261a5cf9cce942d12e4e4729bf254c67eef014ef3b6cf421a272ab480c +size 157765 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_8fd26361f0705a90a632.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_8fd26361f0705a90a632.png new file mode 100644 index 0000000000000000000000000000000000000000..d71a3c7a84b2a61722a420daeeb652249dd5c59f --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_8fd26361f0705a90a632.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd26361f0705a90a6329c8a9cd00e16eae894edbeb49608d76a9da33ccdb3f5 +size 171306 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_cec203cb5c36d2873217.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_cec203cb5c36d2873217.png new file mode 100644 index 0000000000000000000000000000000000000000..4c0a3241e815e1072e83a7c1930add5724c2e6e1 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_cec203cb5c36d2873217.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec203cb5c36d2873217dce75eb242633b0d4fd0aa6d1b1a5ae91203cc511aa7 +size 154952 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_c061c65a6ce343b1660e.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_c061c65a6ce343b1660e.png new file mode 100644 index 0000000000000000000000000000000000000000..bcc8d3c0fbb18c2aed90fb5ab881a79a74a90a6c --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_c061c65a6ce343b1660e.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c061c65a6ce343b1660e7e716906c76632561aff9ccd459c06e2f77a6c6ce023 +size 151206 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_f047fb97b642dc30b33c.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_f047fb97b642dc30b33c.png new file mode 100644 index 0000000000000000000000000000000000000000..3be830fede518a8205d018f3422730b27bc68f08 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_f047fb97b642dc30b33c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f047fb97b642dc30b33c713788d30200b71afdcfb75e30c504fdf6c6a207f390 +size 163793 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_2805ac51dfa6ef4de083.png 
b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_2805ac51dfa6ef4de083.png new file mode 100644 index 0000000000000000000000000000000000000000..71284bdfb848395c693268f4a58f7c484b3ac992 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_2805ac51dfa6ef4de083.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2805ac51dfa6ef4de083f749ae468d53d2e04e1fe1f53862f0e92a1b77c77c0f +size 154531 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_e98ce360ce92d75f9a36.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_e98ce360ce92d75f9a36.png new file mode 100644 index 0000000000000000000000000000000000000000..96d8972d2c2014040d9269500a147c781820c929 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_e98ce360ce92d75f9a36.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e98ce360ce92d75f9a36beceaa6a8e65c74d6190aa53a74c43f7bd80ee9135e9 +size 160429 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_430592107b01c838d952.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_430592107b01c838d952.png new file mode 100644 index 0000000000000000000000000000000000000000..7b859d41e223d88cfcc30a0d94543156bd92097d --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_430592107b01c838d952.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430592107b01c838d95230455d6237687f51489c736abc6f4a54a3a888b2d9d4 +size 151819 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_ecca4db815beca263f13.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_ecca4db815beca263f13.png new file mode 100644 index 0000000000000000000000000000000000000000..527bff166840b69be09c32f54b33a8afa7349421 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_ecca4db815beca263f13.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecca4db815beca263f13207354f4e3e1b9cc6944e4a79a1a08edaefebfa40c19 +size 164005 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_52422bc6ab7caedd5b8c.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_52422bc6ab7caedd5b8c.png new file mode 100644 index 0000000000000000000000000000000000000000..7804b818e67765833102006e8205c3f9a2753821 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_52422bc6ab7caedd5b8c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52422bc6ab7caedd5b8c29f1057d92a2c74d6c38fcd27ede46ebec43ff8cfe59 +size 106641 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_8e35b9b7d6b7a0806553.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_8e35b9b7d6b7a0806553.png new file mode 100644 index 0000000000000000000000000000000000000000..a8bbcdfcc00d28eb2ad200c7ca671522dc63446b 
--- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_8e35b9b7d6b7a0806553.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e35b9b7d6b7a08065532353cfef040cacdaa06a696822639c2349c5ae229bbc +size 128321 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_227067a6cd64b7cdced4.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_227067a6cd64b7cdced4.png new file mode 100644 index 0000000000000000000000000000000000000000..73ca44051bb59bfad72ceb656c79c9743ff4b68c --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_227067a6cd64b7cdced4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:227067a6cd64b7cdced40c483f0da283047ad4d0df910d1d6deeeb57272326f4 +size 154964 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_55ba9221da0bf3c49190.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_55ba9221da0bf3c49190.png new file mode 100644 index 0000000000000000000000000000000000000000..a7ee239599090e349a38a61feeee19ca1e7d5600 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_55ba9221da0bf3c49190.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ba9221da0bf3c4919028ee3c41da6922b3b52403bdedcf340600bdbd33f512 +size 155417 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_9a50d3903fd31767c616.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_9a50d3903fd31767c616.png new file mode 100644 index 0000000000000000000000000000000000000000..c8652ab59f8567c725658b38071a10399cfcba7d --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_9a50d3903fd31767c616.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a50d3903fd31767c61693491fe8690adbe9040c7f7f963a469cda72c36ac636 +size 116216 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_ffacfcca81b53cb27319.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_ffacfcca81b53cb27319.png new file mode 100644 index 0000000000000000000000000000000000000000..ea92180801509d5a83faf01f800be416c6a17141 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_ffacfcca81b53cb27319.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffacfcca81b53cb27319a5287eebacece7bb2bb70ecf2770cbf90c249834bf4b +size 137536 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_935711ba29b3ab613691.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_935711ba29b3ab613691.png new file mode 100644 index 0000000000000000000000000000000000000000..60077216d24148c8123a137ea44ec45536c6b8a9 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_935711ba29b3ab613691.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:935711ba29b3ab6136917a68834005a2c21d880bbcb78d211043cfead73f3db1 +size 134205 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_bf885e1339d92cc386d1.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_bf885e1339d92cc386d1.png new file mode 100644 index 0000000000000000000000000000000000000000..8f68427d9557ffbe515f9e696aac833d8769bf7c --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_bf885e1339d92cc386d1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf885e1339d92cc386d12ff933a6b042a9f83815a2f473435cf3337ae3623e11 +size 152408 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_bdd3a8c8c0c8a7a7d4dd.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_bdd3a8c8c0c8a7a7d4dd.png new file mode 100644 index 0000000000000000000000000000000000000000..96007f6d41f66705495c18834570ebc055c0c490 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_bdd3a8c8c0c8a7a7d4dd.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd3a8c8c0c8a7a7d4dd9c5151f6f7649ed6527208d08f9f8255652e8b034f16 +size 123734 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_c8333d970fbc70e45c64.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_c8333d970fbc70e45c64.png new file mode 100644 index 0000000000000000000000000000000000000000..8e5214a6dc17030c736e893a0547d59bed4f452c --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_c8333d970fbc70e45c64.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8333d970fbc70e45c642cb23a3e7fd7438ecce32d0be670b3a387f953daecf1 +size 167486 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_3d483725c07baf8663d3.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_3d483725c07baf8663d3.png new file mode 100644 index 0000000000000000000000000000000000000000..00d6645819dfdd4a9693792021c6d5ff74a0ad29 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_3d483725c07baf8663d3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d483725c07baf8663d3b6db52b3269fd8031f2ebebafda06080754ba5683d7f +size 144108 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_b0f06ea56e9a9c08850c.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_b0f06ea56e9a9c08850c.png new file mode 100644 index 0000000000000000000000000000000000000000..b0df40b09ef1cda6b9cf9273f58db521b78572a8 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_b0f06ea56e9a9c08850c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0f06ea56e9a9c08850cac9ed8053164b762fd6419e22952eb7b1a36ad1da224 +size 148943 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_60d433cf43a3cb8d1412.png 
b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_60d433cf43a3cb8d1412.png new file mode 100644 index 0000000000000000000000000000000000000000..3d134aad94d1ba4ca786fcb26757af5f22e0b782 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_60d433cf43a3cb8d1412.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d433cf43a3cb8d141289e76d86186829d37e199ad24b3cd9f314d064683b81 +size 179518 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_7cd8c962e4d1b79b5dcc.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_7cd8c962e4d1b79b5dcc.png new file mode 100644 index 0000000000000000000000000000000000000000..508e841842459242c459c798515a257d3fc8702d --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_7cd8c962e4d1b79b5dcc.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cd8c962e4d1b79b5dcc9a9fffa54e2c2c3fc7704df985575327c2ad146acead +size 175060 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_41402987f48490139945.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_41402987f48490139945.png new file mode 100644 index 0000000000000000000000000000000000000000..a80e2aa0aa0e3679562e074024cec89b3c3a9454 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_41402987f48490139945.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41402987f48490139945c8630ce10fcfa3044835d82af955b434a4f01cbb0be4 +size 132213 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_c6c41d57fcadc12fd69b.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_c6c41d57fcadc12fd69b.png new file mode 100644 index 0000000000000000000000000000000000000000..28bdefd5622b7e455c278960cc77ad2d4bc7001f --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_c6c41d57fcadc12fd69b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6c41d57fcadc12fd69bff3e72eec586a0132a0fee1b3e93dc6c6f1b5d2c41d1 +size 164647 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_2d21e8a2ea1688bffb9d.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_2d21e8a2ea1688bffb9d.png new file mode 100644 index 0000000000000000000000000000000000000000..0643baa4f180bfb355192421abae8ea5a0429f31 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_2d21e8a2ea1688bffb9d.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d21e8a2ea1688bffb9dcda19294c42221521cf3c627020900931bdae73005d7 +size 176092 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_a609810c96cec2279a46.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_a609810c96cec2279a46.png new file mode 100644 index 0000000000000000000000000000000000000000..051836d358de492344ae4844304ac614265f51b4 
--- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_a609810c96cec2279a46.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a609810c96cec2279a467c884847027bef702479b858c86a3e752d4823f68f25 +size 126319 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_4a1fe2fe98784f7b8841.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_4a1fe2fe98784f7b8841.png new file mode 100644 index 0000000000000000000000000000000000000000..cc30282c0d2ecb0f9c80794ee74c8d8c605a4bf1 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_4a1fe2fe98784f7b8841.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a1fe2fe98784f7b88413b766592c5a488a59591b64999a55556dcfa562776f2 +size 147009 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_6119b9f39242430c319b.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_6119b9f39242430c319b.png new file mode 100644 index 0000000000000000000000000000000000000000..98ced0718ed5d975db5b40e88bb1324df7e18cd2 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_6119b9f39242430c319b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6119b9f39242430c319b0daceaec879ae9ad9083e344212bdef0e1cd151decc6 +size 136868 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_7e0ee18074e9b8d85c45.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_7e0ee18074e9b8d85c45.png new file mode 100644 index 0000000000000000000000000000000000000000..d9f84038f5553861fefff104f431fa13a67fb55a --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_7e0ee18074e9b8d85c45.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0ee18074e9b8d85c455bb9361748e3609fa839ec598d0ba23b72c54cbbd32f +size 161518 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_b01a808f5a897296f898.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_b01a808f5a897296f898.png new file mode 100644 index 0000000000000000000000000000000000000000..352dfd00189ac53d2a35c2f541a7d11ed76db53d --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_b01a808f5a897296f898.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b01a808f5a897296f898e99594f2fbe5296e949754f2c18ab4c841fbfb3180c1 +size 166963 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_15555bb3e2ce8b16ddcf.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_15555bb3e2ce8b16ddcf.png new file mode 100644 index 0000000000000000000000000000000000000000..203875ac6f102c9e2fa58ad8be4e7af0cd73898e --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_15555bb3e2ce8b16ddcf.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:15555bb3e2ce8b16ddcfa20cef56fe272f4cdba3ee6f67eeb67a180bf2eaba00 +size 138955 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_9652b904aa757dce7aeb.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_9652b904aa757dce7aeb.png new file mode 100644 index 0000000000000000000000000000000000000000..f7f4dde7d9bbe7acf63ef12ccf92771deb4d73ab --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_9652b904aa757dce7aeb.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9652b904aa757dce7aeb7b829d3941c14c3be1223c9043c20ecbfa2adeb780c8 +size 155441 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8500_51b1e18deefe31476638.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8500_51b1e18deefe31476638.png new file mode 100644 index 0000000000000000000000000000000000000000..01ca7c32cd50930ca9b996de5f6a3e8d36c9db2d Binary files /dev/null and b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8500_51b1e18deefe31476638.png differ diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8500_c2d1b91c197ca101b350.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8500_c2d1b91c197ca101b350.png new file mode 100644 index 0000000000000000000000000000000000000000..2d73374aa67fe4facf3a3a89ad1aa7063c54a85f --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8500_c2d1b91c197ca101b350.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2d1b91c197ca101b35012f0071df85cf87e090310cbb4ab1867e35020df33c5 +size 143284 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_03ba3747205343bd9935.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_03ba3747205343bd9935.png new file mode 100644 index 0000000000000000000000000000000000000000..65a846e6f728e7d27743b9f683fd40cff01f8f1b --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_03ba3747205343bd9935.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03ba3747205343bd9935e61ab2385fe52d07526e752022a0229a955d695ac70f +size 176256 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_cc8a6153b15016f58ad3.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_cc8a6153b15016f58ad3.png new file mode 100644 index 0000000000000000000000000000000000000000..46fb24357fdfa794e12e73fae70f4eef89c56781 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_cc8a6153b15016f58ad3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8a6153b15016f58ad3b04fbbafbb6d504df8a5bd258ccc404656e33977d223 +size 137044 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9500_342589ce9380e8bb866b.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9500_342589ce9380e8bb866b.png new file mode 100644 
index 0000000000000000000000000000000000000000..bb62e002af8d68983844dad322e0201440d38faa --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9500_342589ce9380e8bb866b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342589ce9380e8bb866b85d1c53a018795105fe91a580b442f2bb385178278ed +size 166363 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9500_753da26bf467a9c41611.png b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9500_753da26bf467a9c41611.png new file mode 100644 index 0000000000000000000000000000000000000000..febe374134f9679f855543c80236f716c1901c15 Binary files /dev/null and b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9500_753da26bf467a9c41611.png differ diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/output.log b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b5f09f5474afa0167497f0a3e1684ef06d6962ad --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/output.log @@ -0,0 +1,1234 @@ +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 68.66it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/08/2025 15:59:47 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6064.89it/s] +12/08/2025 15:59:48 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=32, W'=56 +12/08/2025 15:59:48 - INFO - __main__ - Theoretical dimensions: F'=1, H'=32, W'=56 +12/08/2025 15:59:48 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 15:59:49 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/08/2025 16:00:04 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 16:00:04 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/08/2025 16:00:06 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/08/2025 16:00:08 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/08/2025 16:00:08 - INFO - __main__ - Wan backbone lr = 0.000030 (base_lr * 0.1) +12/08/2025 16:00:08 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/08/2025 16:00:08 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/08/2025 16:00:15 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/08/2025 16:00:15 - INFO - train.dataset_utils - Using decord for video loading +12/08/2025 16:00:15 - INFO - __main__ - Dataloader configuration: 
+12/08/2025 16:00:15 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/08/2025 16:00:15 - INFO - __main__ - - prefetch_factor: 2 +12/08/2025 16:00:15 - INFO - __main__ - - persistent_workers: True +12/08/2025 16:00:15 - INFO - __main__ - - pin_memory: True +12/08/2025 16:00:15 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/08/2025 16:00:32 - INFO - __main__ - ***** Running training ***** +12/08/2025 16:00:32 - INFO - __main__ - Num training steps = 10000 +12/08/2025 16:00:32 - INFO - __main__ - Instantaneous batch size per device = 2 +12/08/2025 16:00:32 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 64 +12/08/2025 16:00:32 - INFO - __main__ - Gradient Accumulation steps = 4 +12/08/2025 16:01:18 - INFO - __main__ - Step: 10 Loss: 11.0765 LR: 0.000030 +12/08/2025 16:01:56 - INFO - __main__ - Step: 20 Loss: 11.0747 LR: 0.000030 +12/08/2025 16:02:34 - INFO - __main__ - Step: 30 Loss: 11.0743 LR: 0.000030 +12/08/2025 16:03:12 - INFO - __main__ - Step: 40 Loss: 11.0727 LR: 0.000030 +12/08/2025 16:03:51 - INFO - __main__ - Step: 50 Loss: 11.0682 LR: 0.000030 +12/08/2025 16:04:29 - INFO - __main__ - Step: 60 Loss: 11.0666 LR: 0.000030 +12/08/2025 16:05:07 - INFO - __main__ - Step: 70 Loss: 11.0632 LR: 0.000030 +12/08/2025 16:05:45 - INFO - __main__ - Step: 80 Loss: 11.0599 LR: 0.000030 +12/08/2025 16:06:22 - INFO - __main__ - Step: 90 Loss: 11.0537 LR: 0.000030 +12/08/2025 16:07:00 - INFO - __main__ - Step: 100 Loss: 11.0397 LR: 0.000030 +12/08/2025 16:07:37 - INFO - __main__ - Step: 110 Loss: 11.0273 LR: 0.000030 +12/08/2025 16:08:15 - INFO - __main__ - Step: 120 Loss: 11.0027 LR: 0.000030 +12/08/2025 16:08:52 - INFO - __main__ - Step: 130 Loss: 10.9829 LR: 0.000030 +12/08/2025 16:09:30 - INFO - __main__ - Step: 140 Loss: 10.9128 LR: 0.000030 +12/08/2025 16:10:08 - INFO - __main__ - Step: 150 Loss: 10.8865 LR: 0.000030 +12/08/2025 16:10:46 - INFO - __main__ - Step: 160 Loss: 10.8413 LR: 0.000030 +12/08/2025 16:11:24 - INFO - __main__ - Step: 170 Loss: 10.8309 LR: 0.000030 +12/08/2025 16:12:02 - INFO - __main__ - Step: 180 Loss: 10.7915 LR: 0.000030 +12/08/2025 16:12:40 - INFO - __main__ - Step: 190 Loss: 10.7034 LR: 0.000030 +12/08/2025 16:13:17 - INFO - __main__ - Step: 200 Loss: 10.7173 LR: 0.000030 +12/08/2025 16:13:56 - INFO - __main__ - Step: 210 Loss: 10.6464 LR: 0.000030 +12/08/2025 16:14:35 - INFO - __main__ - Step: 220 Loss: 10.5748 LR: 0.000030 +12/08/2025 16:15:13 - INFO - __main__ - Step: 230 Loss: 10.6039 LR: 0.000030 +12/08/2025 16:15:51 - INFO - __main__ - Step: 240 Loss: 10.5468 LR: 0.000030 +12/08/2025 16:16:29 - INFO - __main__ - Step: 250 Loss: 10.5881 LR: 0.000030 +12/08/2025 16:17:07 - INFO - __main__ - Step: 260 Loss: 10.4530 LR: 0.000030 +12/08/2025 16:17:46 - INFO - __main__ - Step: 270 Loss: 10.4748 LR: 0.000030 +12/08/2025 16:18:23 - INFO - __main__ - Step: 280 Loss: 10.4648 LR: 0.000030 +12/08/2025 16:19:00 - INFO - __main__ - Step: 290 Loss: 10.5356 LR: 0.000030 +12/08/2025 16:19:38 - INFO - __main__ - Step: 300 Loss: 10.4129 LR: 0.000030 +12/08/2025 16:20:16 - INFO - __main__ - Step: 310 Loss: 10.5077 LR: 0.000030 +12/08/2025 16:20:55 - INFO - __main__ - Step: 320 Loss: 10.5066 LR: 0.000030 +12/08/2025 16:21:32 - INFO - __main__ - Step: 330 Loss: 10.4071 LR: 0.000030 +12/08/2025 16:22:12 - INFO - __main__ - Step: 340 Loss: 10.4521 LR: 0.000030 +12/08/2025 16:22:49 - INFO - __main__ - Step: 350 Loss: 10.4234 LR: 0.000030 +12/08/2025 16:23:25 - INFO - __main__ - Step: 
360 Loss: 10.2828 LR: 0.000030 +12/08/2025 16:24:02 - INFO - __main__ - Step: 370 Loss: 10.4057 LR: 0.000030 +12/08/2025 16:24:39 - INFO - __main__ - Step: 380 Loss: 10.4306 LR: 0.000030 +12/08/2025 16:25:17 - INFO - __main__ - Step: 390 Loss: 10.4251 LR: 0.000030 +12/08/2025 16:25:54 - INFO - __main__ - Step: 400 Loss: 10.3598 LR: 0.000030 +12/08/2025 16:26:31 - INFO - __main__ - Step: 410 Loss: 10.4499 LR: 0.000030 +12/08/2025 16:27:08 - INFO - __main__ - Step: 420 Loss: 10.3914 LR: 0.000030 +12/08/2025 16:27:47 - INFO - __main__ - Step: 430 Loss: 10.4325 LR: 0.000030 +12/08/2025 16:28:24 - INFO - __main__ - Step: 440 Loss: 10.4138 LR: 0.000030 +12/08/2025 16:29:01 - INFO - __main__ - Step: 450 Loss: 10.4066 LR: 0.000030 +12/08/2025 16:29:39 - INFO - __main__ - Step: 460 Loss: 10.4038 LR: 0.000030 +12/08/2025 16:30:17 - INFO - __main__ - Step: 470 Loss: 10.3804 LR: 0.000030 +12/08/2025 16:30:56 - INFO - __main__ - Step: 480 Loss: 10.4564 LR: 0.000030 +12/08/2025 16:31:35 - INFO - __main__ - Step: 490 Loss: 10.4131 LR: 0.000030 +12/08/2025 16:32:12 - INFO - __main__ - Step: 500 Loss: 10.4588 LR: 0.000030 +12/08/2025 16:32:12 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500 +12/08/2025 16:32:20 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/optimizer.bin +12/08/2025 16:32:20 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/scheduler.bin +12/08/2025 16:32:20 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/sampler.bin +12/08/2025 16:32:20 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500/random_states_0.pkl +12/08/2025 16:32:20 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-500 +12/08/2025 16:32:20 - INFO - __main__ - Generating videos for validation... +12/08/2025 16:32:20 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.67it/s] +12/08/2025 16:32:28 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 16:33:06 - INFO - __main__ - Step: 510 Loss: 10.2678 LR: 0.000030 +12/08/2025 16:33:48 - INFO - __main__ - Step: 520 Loss: 10.3771 LR: 0.000030 +12/08/2025 16:34:26 - INFO - __main__ - Step: 530 Loss: 10.2721 LR: 0.000030 +12/08/2025 16:35:03 - INFO - __main__ - Step: 540 Loss: 10.3496 LR: 0.000030 +12/08/2025 16:35:41 - INFO - __main__ - Step: 550 Loss: 10.3773 LR: 0.000030 +12/08/2025 16:36:19 - INFO - __main__ - Step: 560 Loss: 10.4052 LR: 0.000030 +12/08/2025 16:36:59 - INFO - __main__ - Step: 570 Loss: 10.4429 LR: 0.000030 +12/08/2025 16:37:36 - INFO - __main__ - Step: 580 Loss: 10.3743 LR: 0.000030 +12/08/2025 16:38:14 - INFO - __main__ - Step: 590 Loss: 10.4892 LR: 0.000030 +12/08/2025 16:38:50 - INFO - __main__ - Step: 600 Loss: 10.4520 LR: 0.000030 +12/08/2025 16:39:27 - INFO - __main__ - Step: 610 Loss: 10.3981 LR: 0.000030 +12/08/2025 16:40:05 - INFO - __main__ - Step: 620 Loss: 10.3463 LR: 0.000030 +12/08/2025 16:40:42 - INFO - __main__ - Step: 630 Loss: 10.4125 LR: 0.000030 +12/08/2025 16:41:21 - INFO - __main__ - Step: 640 Loss: 10.4231 LR: 0.000030 +12/08/2025 16:41:58 - INFO - __main__ - Step: 650 Loss: 10.3965 LR: 0.000030 +12/08/2025 16:42:37 - INFO - __main__ - Step: 660 Loss: 10.3814 LR: 0.000030 +12/08/2025 16:43:15 - INFO - __main__ - Step: 670 Loss: 10.3961 LR: 0.000030 +12/08/2025 16:43:52 - INFO - __main__ - Step: 680 Loss: 10.4251 LR: 0.000030 +12/08/2025 16:44:29 - INFO - __main__ - Step: 690 Loss: 10.4327 LR: 0.000030 +12/08/2025 16:45:07 - INFO - __main__ - Step: 700 Loss: 10.2706 LR: 0.000030 +12/08/2025 16:45:44 - INFO - __main__ - Step: 710 Loss: 10.3643 LR: 0.000030 +12/08/2025 16:46:24 - INFO - __main__ - Step: 720 Loss: 10.4459 LR: 0.000030 +12/08/2025 16:47:02 - INFO - __main__ - Step: 730 Loss: 10.4146 LR: 0.000030 +12/08/2025 16:47:40 - INFO - __main__ - Step: 740 Loss: 10.3954 LR: 0.000030 +12/08/2025 16:48:20 - INFO - __main__ - Step: 750 Loss: 10.3323 LR: 0.000030 +12/08/2025 16:48:59 - INFO - __main__ - Step: 760 Loss: 10.3137 LR: 0.000030 +12/08/2025 16:49:37 - INFO - __main__ - Step: 770 Loss: 10.3287 LR: 0.000030 +12/08/2025 16:50:15 - INFO - __main__ - Step: 780 Loss: 10.3346 LR: 0.000030 +12/08/2025 16:50:52 - INFO - __main__ - Step: 790 Loss: 10.3639 LR: 0.000030 +12/08/2025 16:51:29 - INFO - __main__ - Step: 800 Loss: 10.2505 LR: 0.000030 +12/08/2025 16:52:07 - INFO - __main__ - Step: 810 Loss: 10.4047 LR: 0.000030 +12/08/2025 16:52:45 - INFO - __main__ - Step: 820 Loss: 10.4114 LR: 0.000030 +12/08/2025 16:53:23 - INFO - __main__ - Step: 830 Loss: 10.3459 LR: 0.000030 +12/08/2025 16:54:01 - INFO - __main__ - Step: 840 Loss: 10.2657 LR: 0.000030 +12/08/2025 16:54:38 - INFO - __main__ - Step: 850 Loss: 10.4799 LR: 0.000030 +12/08/2025 16:55:16 - INFO - __main__ - Step: 860 Loss: 10.3322 LR: 0.000030 +12/08/2025 16:55:57 - INFO - __main__ - Step: 870 Loss: 10.2937 LR: 0.000030 +12/08/2025 16:56:34 - INFO - __main__ - Step: 880 Loss: 10.4723 LR: 0.000030 +12/08/2025 16:57:11 - INFO - __main__ - Step: 890 Loss: 10.2729 LR: 0.000030 +12/08/2025 16:57:47 - INFO - __main__ - Step: 900 Loss: 10.3868 LR: 0.000030 +12/08/2025 16:58:25 - INFO - __main__ - Step: 910 Loss: 10.4276 LR: 0.000030 +12/08/2025 16:59:01 - INFO - __main__ - Step: 920 
Loss: 10.3418 LR: 0.000030 +12/08/2025 16:59:39 - INFO - __main__ - Step: 930 Loss: 10.4002 LR: 0.000030 +12/08/2025 17:00:18 - INFO - __main__ - Step: 940 Loss: 10.3860 LR: 0.000030 +12/08/2025 17:00:55 - INFO - __main__ - Step: 950 Loss: 10.3090 LR: 0.000030 +12/08/2025 17:01:31 - INFO - __main__ - Step: 960 Loss: 10.3547 LR: 0.000030 +12/08/2025 17:02:10 - INFO - __main__ - Step: 970 Loss: 10.4002 LR: 0.000030 +12/08/2025 17:02:47 - INFO - __main__ - Step: 980 Loss: 10.3720 LR: 0.000030 +12/08/2025 17:03:23 - INFO - __main__ - Step: 990 Loss: 10.3952 LR: 0.000030 +12/08/2025 17:04:00 - INFO - __main__ - Step: 1000 Loss: 10.3774 LR: 0.000030 +12/08/2025 17:04:00 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000 +12/08/2025 17:04:08 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/optimizer.bin +12/08/2025 17:04:08 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/scheduler.bin +12/08/2025 17:04:08 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/sampler.bin +12/08/2025 17:04:08 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000/random_states_0.pkl +12/08/2025 17:04:08 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1000 +12/08/2025 17:04:08 - INFO - __main__ - Generating videos for validation... +12/08/2025 17:04:08 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.31it/s] +12/08/2025 17:04:16 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 17:04:56 - INFO - __main__ - Step: 1010 Loss: 10.3488 LR: 0.000030 +12/08/2025 17:05:38 - INFO - __main__ - Step: 1020 Loss: 10.4207 LR: 0.000030 +12/08/2025 17:06:15 - INFO - __main__ - Step: 1030 Loss: 10.2589 LR: 0.000030 +12/08/2025 17:06:54 - INFO - __main__ - Step: 1040 Loss: 10.4372 LR: 0.000030 +12/08/2025 17:07:30 - INFO - __main__ - Step: 1050 Loss: 10.3223 LR: 0.000030 +12/08/2025 17:08:10 - INFO - __main__ - Step: 1060 Loss: 10.3129 LR: 0.000030 +12/08/2025 17:08:57 - INFO - __main__ - Step: 1070 Loss: 10.3165 LR: 0.000030 +12/08/2025 17:09:34 - INFO - __main__ - Step: 1080 Loss: 10.4149 LR: 0.000030 +12/08/2025 17:10:10 - INFO - __main__ - Step: 1090 Loss: 10.2954 LR: 0.000030 +12/08/2025 17:10:48 - INFO - __main__ - Step: 1100 Loss: 10.3628 LR: 0.000030 +12/08/2025 17:11:25 - INFO - __main__ - Step: 1110 Loss: 10.3230 LR: 0.000030 +12/08/2025 17:12:01 - INFO - __main__ - Step: 1120 Loss: 10.3802 LR: 0.000030 +12/08/2025 17:12:38 - INFO - __main__ - Step: 1130 Loss: 10.4042 LR: 0.000030 +12/08/2025 17:13:15 - INFO - __main__ - Step: 1140 Loss: 10.3732 LR: 0.000030 +12/08/2025 17:13:52 - INFO - __main__ - Step: 1150 Loss: 10.2436 LR: 0.000030 +12/08/2025 17:14:31 - INFO - __main__ - Step: 1160 Loss: 10.3452 LR: 0.000030 +12/08/2025 17:15:09 - INFO - __main__ - Step: 1170 Loss: 10.3288 LR: 0.000030 +12/08/2025 17:15:47 - INFO - __main__ - Step: 1180 Loss: 10.3345 LR: 0.000030 +12/08/2025 17:16:25 - INFO - __main__ - Step: 1190 Loss: 10.3835 LR: 0.000030 +12/08/2025 17:17:02 - INFO - __main__ - Step: 
1200 Loss: 10.2665 LR: 0.000030 +12/08/2025 17:17:41 - INFO - __main__ - Step: 1210 Loss: 10.3739 LR: 0.000030 +12/08/2025 17:18:19 - INFO - __main__ - Step: 1220 Loss: 10.4482 LR: 0.000030 +12/08/2025 17:19:02 - INFO - __main__ - Step: 1230 Loss: 10.1238 LR: 0.000030 +12/08/2025 17:19:39 - INFO - __main__ - Step: 1240 Loss: 10.3839 LR: 0.000030 +12/08/2025 17:20:16 - INFO - __main__ - Step: 1250 Loss: 10.3929 LR: 0.000030 +12/08/2025 17:20:52 - INFO - __main__ - Step: 1260 Loss: 10.2856 LR: 0.000030 +12/08/2025 17:21:29 - INFO - __main__ - Step: 1270 Loss: 10.3958 LR: 0.000030 +12/08/2025 17:22:07 - INFO - __main__ - Step: 1280 Loss: 10.4225 LR: 0.000030 +12/08/2025 17:22:44 - INFO - __main__ - Step: 1290 Loss: 10.3787 LR: 0.000030 +12/08/2025 17:23:21 - INFO - __main__ - Step: 1300 Loss: 10.2556 LR: 0.000030 +12/08/2025 17:23:58 - INFO - __main__ - Step: 1310 Loss: 10.3971 LR: 0.000030 +12/08/2025 17:24:34 - INFO - __main__ - Step: 1320 Loss: 10.1756 LR: 0.000030 +12/08/2025 17:25:14 - INFO - __main__ - Step: 1330 Loss: 10.3488 LR: 0.000030 +12/08/2025 17:25:50 - INFO - __main__ - Step: 1340 Loss: 10.3324 LR: 0.000030 +12/08/2025 17:26:27 - INFO - __main__ - Step: 1350 Loss: 10.3611 LR: 0.000030 +12/08/2025 17:27:04 - INFO - __main__ - Step: 1360 Loss: 10.3655 LR: 0.000030 +12/08/2025 17:27:41 - INFO - __main__ - Step: 1370 Loss: 10.1179 LR: 0.000030 +12/08/2025 17:28:17 - INFO - __main__ - Step: 1380 Loss: 10.3356 LR: 0.000030 +12/08/2025 17:28:54 - INFO - __main__ - Step: 1390 Loss: 10.3898 LR: 0.000030 +12/08/2025 17:29:31 - INFO - __main__ - Step: 1400 Loss: 10.3390 LR: 0.000030 +12/08/2025 17:30:11 - INFO - __main__ - Step: 1410 Loss: 10.1545 LR: 0.000030 +12/08/2025 17:30:49 - INFO - __main__ - Step: 1420 Loss: 10.4155 LR: 0.000030 +12/08/2025 17:31:25 - INFO - __main__ - Step: 1430 Loss: 10.3101 LR: 0.000030 +12/08/2025 17:32:02 - INFO - __main__ - Step: 1440 Loss: 10.1570 LR: 0.000030 +12/08/2025 17:32:38 - INFO - __main__ - Step: 1450 Loss: 10.3250 LR: 0.000030 +12/08/2025 17:33:15 - INFO - __main__ - Step: 1460 Loss: 10.3853 LR: 0.000030 +12/08/2025 17:33:51 - INFO - __main__ - Step: 1470 Loss: 10.3949 LR: 0.000030 +12/08/2025 17:34:28 - INFO - __main__ - Step: 1480 Loss: 10.3797 LR: 0.000030 +12/08/2025 17:35:05 - INFO - __main__ - Step: 1490 Loss: 10.3865 LR: 0.000030 +12/08/2025 17:35:41 - INFO - __main__ - Step: 1500 Loss: 10.3832 LR: 0.000030 +12/08/2025 17:35:41 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500 +12/08/2025 17:35:51 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/optimizer.bin +12/08/2025 17:35:51 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/scheduler.bin +12/08/2025 17:35:51 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/sampler.bin +12/08/2025 17:35:51 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500/random_states_0.pkl +12/08/2025 17:35:51 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-1500 +12/08/2025 17:35:51 - INFO - __main__ - Generating videos for validation... +12/08/2025 17:35:51 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.66it/s] +12/08/2025 17:35:58 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 17:36:36 - INFO - __main__ - Step: 1510 Loss: 10.2973 LR: 0.000030 +12/08/2025 17:37:13 - INFO - __main__ - Step: 1520 Loss: 10.1411 LR: 0.000030 +12/08/2025 17:37:53 - INFO - __main__ - Step: 1530 Loss: 10.2939 LR: 0.000030 +12/08/2025 17:38:30 - INFO - __main__ - Step: 1540 Loss: 10.4230 LR: 0.000030 +12/08/2025 17:39:07 - INFO - __main__ - Step: 1550 Loss: 10.4209 LR: 0.000030 +12/08/2025 17:39:46 - INFO - __main__ - Step: 1560 Loss: 10.2173 LR: 0.000030 +12/08/2025 17:40:23 - INFO - __main__ - Step: 1570 Loss: 10.3776 LR: 0.000030 +12/08/2025 17:41:00 - INFO - __main__ - Step: 1580 Loss: 10.2596 LR: 0.000030 +12/08/2025 17:41:39 - INFO - __main__ - Step: 1590 Loss: 10.3701 LR: 0.000030 +12/08/2025 17:42:17 - INFO - __main__ - Step: 1600 Loss: 10.3020 LR: 0.000030 +12/08/2025 17:42:55 - INFO - __main__ - Step: 1610 Loss: 10.3507 LR: 0.000030 +12/08/2025 17:43:34 - INFO - __main__ - Step: 1620 Loss: 10.4234 LR: 0.000030 +12/08/2025 17:44:12 - INFO - __main__ - Step: 1630 Loss: 10.3653 LR: 0.000030 +12/08/2025 17:44:51 - INFO - __main__ - Step: 1640 Loss: 10.3003 LR: 0.000030 +12/08/2025 17:45:30 - INFO - __main__ - Step: 1650 Loss: 10.2873 LR: 0.000030 +12/08/2025 17:46:09 - INFO - __main__ - Step: 1660 Loss: 10.2909 LR: 0.000030 +12/08/2025 17:46:46 - INFO - __main__ - Step: 1670 Loss: 10.3515 LR: 0.000030 +12/08/2025 17:47:25 - INFO - __main__ - Step: 1680 Loss: 10.0193 LR: 0.000030 +12/08/2025 17:48:05 - INFO - __main__ - Step: 1690 Loss: 10.4187 LR: 0.000030 +12/08/2025 17:48:44 - INFO - __main__ - Step: 1700 Loss: 10.4093 LR: 0.000030 +12/08/2025 17:49:23 - INFO - __main__ - Step: 1710 Loss: 10.2773 LR: 0.000030 +12/08/2025 17:50:03 - INFO - __main__ - Step: 1720 Loss: 10.3371 LR: 0.000030 +12/08/2025 17:50:43 - INFO - __main__ - Step: 1730 Loss: 10.3882 LR: 0.000030 +12/08/2025 17:51:22 - INFO - __main__ - Step: 1740 Loss: 10.3619 LR: 0.000030 +12/08/2025 17:52:02 - INFO - __main__ - Step: 1750 Loss: 10.3271 LR: 0.000030 +12/08/2025 17:52:41 - INFO - __main__ - Step: 1760 Loss: 10.4465 LR: 0.000030 +12/08/2025 17:53:18 - INFO - __main__ - Step: 1770 Loss: 10.3722 LR: 0.000030 +12/08/2025 17:53:57 - INFO - __main__ - Step: 1780 Loss: 10.2072 LR: 0.000030 +12/08/2025 17:54:35 - INFO - __main__ - Step: 1790 Loss: 10.4272 LR: 0.000030 +12/08/2025 17:55:14 - INFO - __main__ - Step: 1800 Loss: 10.3846 LR: 0.000030 +12/08/2025 17:55:53 - INFO - __main__ - Step: 1810 Loss: 10.4292 LR: 0.000030 +12/08/2025 17:56:32 - INFO - __main__ - Step: 1820 Loss: 10.3997 LR: 0.000030 +12/08/2025 17:57:12 - INFO - __main__ - Step: 1830 Loss: 10.3803 LR: 0.000030 +12/08/2025 17:57:50 - INFO - __main__ - Step: 1840 Loss: 10.1679 LR: 0.000030 +12/08/2025 17:58:30 - INFO - __main__ - Step: 1850 Loss: 10.4325 LR: 0.000030 +12/08/2025 17:59:08 - INFO - __main__ - Step: 1860 Loss: 10.3337 LR: 0.000030 +12/08/2025 17:59:46 - INFO - __main__ - Step: 1870 Loss: 10.3278 LR: 0.000030 +12/08/2025 18:00:28 - INFO - __main__ - Step: 1880 Loss: 10.2863 LR: 0.000030 +12/08/2025 18:01:06 - INFO - __main__ - Step: 1890 Loss: 10.2458 LR: 0.000030 +12/08/2025 18:01:46 - INFO - __main__ - Step: 1900 Loss: 10.3747 LR: 0.000030 +12/08/2025 18:02:24 - INFO - __main__ - Step: 1910 Loss: 10.4138 LR: 0.000030 +12/08/2025 
18:03:03 - INFO - __main__ - Step: 1920 Loss: 10.4281 LR: 0.000030 +12/08/2025 18:03:42 - INFO - __main__ - Step: 1930 Loss: 10.3979 LR: 0.000030 +12/08/2025 18:04:20 - INFO - __main__ - Step: 1940 Loss: 10.3887 LR: 0.000030 +12/08/2025 18:05:00 - INFO - __main__ - Step: 1950 Loss: 10.2837 LR: 0.000030 +12/08/2025 18:05:39 - INFO - __main__ - Step: 1960 Loss: 10.4395 LR: 0.000030 +12/08/2025 18:06:19 - INFO - __main__ - Step: 1970 Loss: 10.4266 LR: 0.000030 +12/08/2025 18:06:59 - INFO - __main__ - Step: 1980 Loss: 10.3031 LR: 0.000030 +12/08/2025 18:07:38 - INFO - __main__ - Step: 1990 Loss: 10.4140 LR: 0.000030 +12/08/2025 18:08:19 - INFO - __main__ - Step: 2000 Loss: 10.4107 LR: 0.000030 +12/08/2025 18:08:19 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000 +12/08/2025 18:08:27 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/optimizer.bin +12/08/2025 18:08:27 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/scheduler.bin +12/08/2025 18:08:27 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/sampler.bin +12/08/2025 18:08:27 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000/random_states_0.pkl +12/08/2025 18:08:27 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2000 +12/08/2025 18:08:27 - INFO - __main__ - Generating videos for validation... +12/08/2025 18:08:27 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.82it/s] +12/08/2025 18:08:33 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 18:09:12 - INFO - __main__ - Step: 2010 Loss: 10.3039 LR: 0.000030 +12/08/2025 18:09:54 - INFO - __main__ - Step: 2020 Loss: 10.3751 LR: 0.000030 +12/08/2025 18:10:33 - INFO - __main__ - Step: 2030 Loss: 10.3716 LR: 0.000030 +12/08/2025 18:11:12 - INFO - __main__ - Step: 2040 Loss: 10.3147 LR: 0.000030 +12/08/2025 18:11:50 - INFO - __main__ - Step: 2050 Loss: 10.3584 LR: 0.000030 +12/08/2025 18:12:29 - INFO - __main__ - Step: 2060 Loss: 10.3668 LR: 0.000030 +12/08/2025 18:13:08 - INFO - __main__ - Step: 2070 Loss: 10.2852 LR: 0.000030 +12/08/2025 18:13:47 - INFO - __main__ - Step: 2080 Loss: 10.3371 LR: 0.000030 +12/08/2025 18:14:27 - INFO - __main__ - Step: 2090 Loss: 10.4287 LR: 0.000030 +12/08/2025 18:15:06 - INFO - __main__ - Step: 2100 Loss: 10.3314 LR: 0.000030 +12/08/2025 18:15:44 - INFO - __main__ - Step: 2110 Loss: 10.3791 LR: 0.000030 +12/08/2025 18:16:23 - INFO - __main__ - Step: 2120 Loss: 10.3936 LR: 0.000030 +12/08/2025 18:17:02 - INFO - __main__ - Step: 2130 Loss: 10.3446 LR: 0.000030 +12/08/2025 18:17:41 - INFO - __main__ - Step: 2140 Loss: 10.3441 LR: 0.000030 +12/08/2025 18:18:18 - INFO - __main__ - Step: 2150 Loss: 10.2765 LR: 0.000030 +12/08/2025 18:18:57 - INFO - __main__ - Step: 2160 Loss: 10.2760 LR: 0.000030 +12/08/2025 18:19:35 - INFO - __main__ - Step: 2170 Loss: 10.4445 LR: 0.000030 +12/08/2025 18:20:14 - INFO - __main__ - Step: 2180 Loss: 10.4068 LR: 0.000030 +12/08/2025 18:20:54 - INFO - __main__ - Step: 2190 Loss: 10.3239 LR: 0.000030 
+12/08/2025 18:21:32 - INFO - __main__ - Step: 2200 Loss: 10.3521 LR: 0.000030 +12/08/2025 18:22:11 - INFO - __main__ - Step: 2210 Loss: 10.3568 LR: 0.000030 +12/08/2025 18:22:50 - INFO - __main__ - Step: 2220 Loss: 10.1969 LR: 0.000030 +12/08/2025 18:23:29 - INFO - __main__ - Step: 2230 Loss: 10.3199 LR: 0.000030 +12/08/2025 18:24:07 - INFO - __main__ - Step: 2240 Loss: 10.2889 LR: 0.000030 +12/08/2025 18:24:45 - INFO - __main__ - Step: 2250 Loss: 10.3757 LR: 0.000030 +12/08/2025 18:25:25 - INFO - __main__ - Step: 2260 Loss: 10.3597 LR: 0.000030 +12/08/2025 18:26:03 - INFO - __main__ - Step: 2270 Loss: 10.3497 LR: 0.000030 +12/08/2025 18:26:42 - INFO - __main__ - Step: 2280 Loss: 10.3869 LR: 0.000030 +12/08/2025 18:27:20 - INFO - __main__ - Step: 2290 Loss: 10.4222 LR: 0.000030 +12/08/2025 18:27:56 - INFO - __main__ - Step: 2300 Loss: 10.3973 LR: 0.000030 +12/08/2025 18:28:35 - INFO - __main__ - Step: 2310 Loss: 10.2908 LR: 0.000030 +12/08/2025 18:29:14 - INFO - __main__ - Step: 2320 Loss: 10.4200 LR: 0.000030 +12/08/2025 18:29:52 - INFO - __main__ - Step: 2330 Loss: 10.1784 LR: 0.000030 +12/08/2025 18:30:31 - INFO - __main__ - Step: 2340 Loss: 10.2928 LR: 0.000030 +12/08/2025 18:31:09 - INFO - __main__ - Step: 2350 Loss: 10.1785 LR: 0.000030 +12/08/2025 18:31:47 - INFO - __main__ - Step: 2360 Loss: 10.0671 LR: 0.000030 +12/08/2025 18:32:25 - INFO - __main__ - Step: 2370 Loss: 10.3262 LR: 0.000030 +12/08/2025 18:33:02 - INFO - __main__ - Step: 2380 Loss: 10.3750 LR: 0.000030 +12/08/2025 18:33:41 - INFO - __main__ - Step: 2390 Loss: 10.3780 LR: 0.000030 +12/08/2025 18:34:22 - INFO - __main__ - Step: 2400 Loss: 10.3725 LR: 0.000030 +12/08/2025 18:35:01 - INFO - __main__ - Step: 2410 Loss: 10.3987 LR: 0.000030 +12/08/2025 18:35:41 - INFO - __main__ - Step: 2420 Loss: 10.3628 LR: 0.000030 +12/08/2025 18:36:20 - INFO - __main__ - Step: 2430 Loss: 10.2904 LR: 0.000030 +12/08/2025 18:36:57 - INFO - __main__ - Step: 2440 Loss: 10.1454 LR: 0.000030 +12/08/2025 18:37:36 - INFO - __main__ - Step: 2450 Loss: 10.3886 LR: 0.000030 +12/08/2025 18:38:14 - INFO - __main__ - Step: 2460 Loss: 10.3994 LR: 0.000030 +12/08/2025 18:38:55 - INFO - __main__ - Step: 2470 Loss: 10.3844 LR: 0.000030 +12/08/2025 18:39:34 - INFO - __main__ - Step: 2480 Loss: 10.3436 LR: 0.000030 +12/08/2025 18:40:12 - INFO - __main__ - Step: 2490 Loss: 10.2371 LR: 0.000030 +12/08/2025 18:40:51 - INFO - __main__ - Step: 2500 Loss: 10.3769 LR: 0.000030 +12/08/2025 18:40:51 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500 +12/08/2025 18:40:59 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/optimizer.bin +12/08/2025 18:40:59 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/scheduler.bin +12/08/2025 18:40:59 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/sampler.bin +12/08/2025 18:40:59 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500/random_states_0.pkl +12/08/2025 18:40:59 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-2500 +12/08/2025 18:40:59 - INFO - __main__ - Generating videos for validation... +12/08/2025 18:40:59 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.74it/s] +12/08/2025 18:41:05 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 18:41:44 - INFO - __main__ - Step: 2510 Loss: 10.4104 LR: 0.000030 +12/08/2025 18:42:23 - INFO - __main__ - Step: 2520 Loss: 10.3128 LR: 0.000030 +12/08/2025 18:43:01 - INFO - __main__ - Step: 2530 Loss: 10.3471 LR: 0.000030 +12/08/2025 18:43:41 - INFO - __main__ - Step: 2540 Loss: 10.2331 LR: 0.000030 +12/08/2025 18:44:20 - INFO - __main__ - Step: 2550 Loss: 10.4053 LR: 0.000030 +12/08/2025 18:45:00 - INFO - __main__ - Step: 2560 Loss: 10.3362 LR: 0.000030 +12/08/2025 18:45:39 - INFO - __main__ - Step: 2570 Loss: 10.1949 LR: 0.000030 +12/08/2025 18:46:18 - INFO - __main__ - Step: 2580 Loss: 10.3445 LR: 0.000030 +12/08/2025 18:46:58 - INFO - __main__ - Step: 2590 Loss: 10.2847 LR: 0.000030 +12/08/2025 18:47:38 - INFO - __main__ - Step: 2600 Loss: 10.3255 LR: 0.000030 +12/08/2025 18:48:17 - INFO - __main__ - Step: 2610 Loss: 10.3907 LR: 0.000030 +12/08/2025 18:48:56 - INFO - __main__ - Step: 2620 Loss: 10.2480 LR: 0.000030 +12/08/2025 18:49:35 - INFO - __main__ - Step: 2630 Loss: 10.3263 LR: 0.000030 +12/08/2025 18:50:15 - INFO - __main__ - Step: 2640 Loss: 10.1264 LR: 0.000030 +12/08/2025 18:50:55 - INFO - __main__ - Step: 2650 Loss: 10.4146 LR: 0.000030 +12/08/2025 18:51:35 - INFO - __main__ - Step: 2660 Loss: 10.3681 LR: 0.000030 +12/08/2025 18:52:13 - INFO - __main__ - Step: 2670 Loss: 10.2980 LR: 0.000030 +12/08/2025 18:52:53 - INFO - __main__ - Step: 2680 Loss: 10.3355 LR: 0.000030 +12/08/2025 18:53:32 - INFO - __main__ - Step: 2690 Loss: 10.2069 LR: 0.000030 +12/08/2025 18:54:12 - INFO - __main__ - Step: 2700 Loss: 10.3311 LR: 0.000030 +12/08/2025 18:54:52 - INFO - __main__ - Step: 2710 Loss: 10.3150 LR: 0.000030 +12/08/2025 18:55:31 - INFO - __main__ - Step: 2720 Loss: 10.2567 LR: 0.000030 +12/08/2025 18:56:11 - INFO - __main__ - Step: 2730 Loss: 10.2787 LR: 0.000030 +12/08/2025 18:56:50 - INFO - __main__ - Step: 2740 Loss: 10.3526 LR: 0.000030 +12/08/2025 18:57:29 - INFO - __main__ - Step: 2750 Loss: 10.3628 LR: 0.000030 +12/08/2025 18:58:08 - INFO - __main__ - Step: 2760 Loss: 10.2796 LR: 0.000030 +12/08/2025 18:58:47 - INFO - __main__ - Step: 2770 Loss: 10.2616 LR: 0.000030 +12/08/2025 18:59:27 - INFO - __main__ - Step: 2780 Loss: 10.3035 LR: 0.000030 +12/08/2025 19:00:05 - INFO - __main__ - Step: 2790 Loss: 10.3651 LR: 0.000030 +12/08/2025 19:00:45 - INFO - __main__ - Step: 2800 Loss: 10.3469 LR: 0.000030 +12/08/2025 19:01:25 - INFO - __main__ - Step: 2810 Loss: 10.3005 LR: 0.000030 +12/08/2025 19:02:05 - INFO - __main__ - Step: 2820 Loss: 10.2700 LR: 0.000030 +12/08/2025 19:02:44 - INFO - __main__ - Step: 2830 Loss: 10.3371 LR: 0.000030 +12/08/2025 19:03:24 - INFO - __main__ - Step: 2840 Loss: 10.3848 LR: 0.000030 +12/08/2025 19:04:04 - INFO - __main__ - Step: 2850 Loss: 10.3789 LR: 0.000030 +12/08/2025 19:04:42 - INFO - __main__ - Step: 2860 Loss: 10.3298 LR: 0.000030 +12/08/2025 19:05:22 - INFO - __main__ - Step: 2870 Loss: 10.3286 LR: 0.000030 +12/08/2025 19:06:00 - INFO - __main__ - Step: 2880 Loss: 10.2525 LR: 0.000030 +12/08/2025 19:06:39 - INFO - __main__ - Step: 2890 Loss: 10.4446 LR: 0.000030 +12/08/2025 19:07:19 - INFO - __main__ - Step: 2900 Loss: 10.3086 LR: 0.000030 +12/08/2025 19:07:59 - INFO - __main__ - Step: 2910 Loss: 10.3149 LR: 0.000030 +12/08/2025 
19:08:39 - INFO - __main__ - Step: 2920 Loss: 10.3575 LR: 0.000030 +12/08/2025 19:09:18 - INFO - __main__ - Step: 2930 Loss: 10.2278 LR: 0.000030 +12/08/2025 19:09:57 - INFO - __main__ - Step: 2940 Loss: 10.2349 LR: 0.000030 +12/08/2025 19:10:36 - INFO - __main__ - Step: 2950 Loss: 10.4023 LR: 0.000030 +12/08/2025 19:11:15 - INFO - __main__ - Step: 2960 Loss: 10.2979 LR: 0.000030 +12/08/2025 19:11:54 - INFO - __main__ - Step: 2970 Loss: 10.3096 LR: 0.000030 +12/08/2025 19:12:35 - INFO - __main__ - Step: 2980 Loss: 10.3134 LR: 0.000030 +12/08/2025 19:13:13 - INFO - __main__ - Step: 2990 Loss: 10.3931 LR: 0.000030 +12/08/2025 19:13:54 - INFO - __main__ - Step: 3000 Loss: 10.3829 LR: 0.000030 +12/08/2025 19:13:54 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000 +12/08/2025 19:14:02 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/optimizer.bin +12/08/2025 19:14:02 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/scheduler.bin +12/08/2025 19:14:02 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/sampler.bin +12/08/2025 19:14:02 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000/random_states_0.pkl +12/08/2025 19:14:02 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3000 +12/08/2025 19:14:02 - INFO - __main__ - Generating videos for validation... +12/08/2025 19:14:02 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.79it/s] +12/08/2025 19:14:08 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 19:14:49 - INFO - __main__ - Step: 3010 Loss: 10.2929 LR: 0.000030 +12/08/2025 19:15:32 - INFO - __main__ - Step: 3020 Loss: 10.4008 LR: 0.000030 +12/08/2025 19:16:12 - INFO - __main__ - Step: 3030 Loss: 10.3054 LR: 0.000030 +12/08/2025 19:16:52 - INFO - __main__ - Step: 3040 Loss: 10.3365 LR: 0.000030 +12/08/2025 19:17:36 - INFO - __main__ - Step: 3050 Loss: 10.3967 LR: 0.000030 +12/08/2025 19:18:16 - INFO - __main__ - Step: 3060 Loss: 10.2645 LR: 0.000030 +12/08/2025 19:18:55 - INFO - __main__ - Step: 3070 Loss: 10.1784 LR: 0.000030 +12/08/2025 19:19:35 - INFO - __main__ - Step: 3080 Loss: 9.6810 LR: 0.000030 +12/08/2025 19:20:15 - INFO - __main__ - Step: 3090 Loss: 10.3777 LR: 0.000030 +12/08/2025 19:20:55 - INFO - __main__ - Step: 3100 Loss: 10.3118 LR: 0.000030 +12/08/2025 19:21:36 - INFO - __main__ - Step: 3110 Loss: 10.3528 LR: 0.000030 +12/08/2025 19:22:21 - INFO - __main__ - Step: 3120 Loss: 10.2164 LR: 0.000030 +12/08/2025 19:23:01 - INFO - __main__ - Step: 3130 Loss: 10.3696 LR: 0.000030 +12/08/2025 19:23:42 - INFO - __main__ - Step: 3140 Loss: 10.2756 LR: 0.000030 +12/08/2025 19:24:21 - INFO - __main__ - Step: 3150 Loss: 10.2743 LR: 0.000030 +12/08/2025 19:25:02 - INFO - __main__ - Step: 3160 Loss: 10.4775 LR: 0.000030 +12/08/2025 19:25:42 - INFO - __main__ - Step: 3170 Loss: 10.3352 LR: 0.000030 +12/08/2025 19:26:23 - INFO - __main__ - Step: 3180 Loss: 10.2950 LR: 0.000030 +12/08/2025 19:27:04 - INFO - __main__ - Step: 3190 Loss: 10.3197 LR: 0.000030 
+12/08/2025 19:27:44 - INFO - __main__ - Step: 3200 Loss: 10.4200 LR: 0.000030 +12/08/2025 19:28:25 - INFO - __main__ - Step: 3210 Loss: 10.4490 LR: 0.000030 +12/08/2025 19:29:05 - INFO - __main__ - Step: 3220 Loss: 10.2776 LR: 0.000030 +12/08/2025 19:29:44 - INFO - __main__ - Step: 3230 Loss: 10.3765 LR: 0.000030 +12/08/2025 19:30:25 - INFO - __main__ - Step: 3240 Loss: 10.4211 LR: 0.000030 +12/08/2025 19:31:04 - INFO - __main__ - Step: 3250 Loss: 10.1657 LR: 0.000030 +12/08/2025 19:31:44 - INFO - __main__ - Step: 3260 Loss: 10.3544 LR: 0.000030 +12/08/2025 19:32:25 - INFO - __main__ - Step: 3270 Loss: 10.3862 LR: 0.000030 +12/08/2025 19:33:06 - INFO - __main__ - Step: 3280 Loss: 10.3992 LR: 0.000030 +12/08/2025 19:33:47 - INFO - __main__ - Step: 3290 Loss: 10.3372 LR: 0.000030 +12/08/2025 19:34:28 - INFO - __main__ - Step: 3300 Loss: 10.2070 LR: 0.000030 +12/08/2025 19:35:08 - INFO - __main__ - Step: 3310 Loss: 10.2486 LR: 0.000030 +12/08/2025 19:35:50 - INFO - __main__ - Step: 3320 Loss: 10.4331 LR: 0.000030 +12/08/2025 19:36:31 - INFO - __main__ - Step: 3330 Loss: 10.2179 LR: 0.000030 +12/08/2025 19:37:11 - INFO - __main__ - Step: 3340 Loss: 10.3287 LR: 0.000030 +12/08/2025 19:37:52 - INFO - __main__ - Step: 3350 Loss: 10.2189 LR: 0.000030 +12/08/2025 19:38:31 - INFO - __main__ - Step: 3360 Loss: 10.3541 LR: 0.000030 +12/08/2025 19:39:11 - INFO - __main__ - Step: 3370 Loss: 10.1355 LR: 0.000030 +12/08/2025 19:39:51 - INFO - __main__ - Step: 3380 Loss: 10.1508 LR: 0.000030 +12/08/2025 19:40:31 - INFO - __main__ - Step: 3390 Loss: 10.2778 LR: 0.000030 +12/08/2025 19:41:12 - INFO - __main__ - Step: 3400 Loss: 10.2547 LR: 0.000030 +12/08/2025 19:41:52 - INFO - __main__ - Step: 3410 Loss: 10.2936 LR: 0.000030 +12/08/2025 19:42:34 - INFO - __main__ - Step: 3420 Loss: 10.2440 LR: 0.000030 +12/08/2025 19:43:14 - INFO - __main__ - Step: 3430 Loss: 10.2664 LR: 0.000030 +12/08/2025 19:43:54 - INFO - __main__ - Step: 3440 Loss: 10.2321 LR: 0.000030 +12/08/2025 19:44:34 - INFO - __main__ - Step: 3450 Loss: 10.2659 LR: 0.000030 +12/08/2025 19:45:14 - INFO - __main__ - Step: 3460 Loss: 10.4566 LR: 0.000030 +12/08/2025 19:45:55 - INFO - __main__ - Step: 3470 Loss: 10.2818 LR: 0.000030 +12/08/2025 19:46:36 - INFO - __main__ - Step: 3480 Loss: 10.3455 LR: 0.000030 +12/08/2025 19:47:17 - INFO - __main__ - Step: 3490 Loss: 10.3028 LR: 0.000030 +12/08/2025 19:47:56 - INFO - __main__ - Step: 3500 Loss: 10.3447 LR: 0.000030 +12/08/2025 19:47:56 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500 +12/08/2025 19:48:05 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/optimizer.bin +12/08/2025 19:48:05 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/scheduler.bin +12/08/2025 19:48:05 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/sampler.bin +12/08/2025 19:48:05 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500/random_states_0.pkl +12/08/2025 19:48:05 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-3500 +12/08/2025 19:48:05 - INFO - __main__ - Generating videos for validation... +12/08/2025 19:48:05 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.86it/s] +12/08/2025 19:48:11 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 19:48:51 - INFO - __main__ - Step: 3510 Loss: 10.4375 LR: 0.000030 +12/08/2025 19:49:30 - INFO - __main__ - Step: 3520 Loss: 10.2045 LR: 0.000030 +12/08/2025 19:50:11 - INFO - __main__ - Step: 3530 Loss: 10.3081 LR: 0.000030 +12/08/2025 19:50:52 - INFO - __main__ - Step: 3540 Loss: 10.3131 LR: 0.000030 +12/08/2025 19:51:32 - INFO - __main__ - Step: 3550 Loss: 10.3914 LR: 0.000030 +12/08/2025 19:52:13 - INFO - __main__ - Step: 3560 Loss: 10.2767 LR: 0.000030 +12/08/2025 19:52:54 - INFO - __main__ - Step: 3570 Loss: 10.3079 LR: 0.000030 +12/08/2025 19:53:34 - INFO - __main__ - Step: 3580 Loss: 10.3233 LR: 0.000030 +12/08/2025 19:54:16 - INFO - __main__ - Step: 3590 Loss: 10.3012 LR: 0.000030 +12/08/2025 19:54:57 - INFO - __main__ - Step: 3600 Loss: 10.3112 LR: 0.000030 +12/08/2025 19:55:38 - INFO - __main__ - Step: 3610 Loss: 10.2488 LR: 0.000030 +12/08/2025 19:56:21 - INFO - __main__ - Step: 3620 Loss: 10.3730 LR: 0.000030 +12/08/2025 19:57:02 - INFO - __main__ - Step: 3630 Loss: 10.3894 LR: 0.000030 +12/08/2025 19:57:42 - INFO - __main__ - Step: 3640 Loss: 10.1371 LR: 0.000030 +12/08/2025 19:58:24 - INFO - __main__ - Step: 3650 Loss: 10.3148 LR: 0.000030 +12/08/2025 19:59:06 - INFO - __main__ - Step: 3660 Loss: 10.2854 LR: 0.000030 +12/08/2025 19:59:47 - INFO - __main__ - Step: 3670 Loss: 10.2385 LR: 0.000030 +12/08/2025 20:00:29 - INFO - __main__ - Step: 3680 Loss: 10.0399 LR: 0.000030 +12/08/2025 20:01:10 - INFO - __main__ - Step: 3690 Loss: 10.1957 LR: 0.000030 +12/08/2025 20:01:52 - INFO - __main__ - Step: 3700 Loss: 10.3439 LR: 0.000030 +12/08/2025 20:02:32 - INFO - __main__ - Step: 3710 Loss: 10.3401 LR: 0.000030 +12/08/2025 20:03:14 - INFO - __main__ - Step: 3720 Loss: 10.2707 LR: 0.000030 +12/08/2025 20:03:55 - INFO - __main__ - Step: 3730 Loss: 9.9035 LR: 0.000030 +12/08/2025 20:04:38 - INFO - __main__ - Step: 3740 Loss: 10.2220 LR: 0.000030 +12/08/2025 20:05:19 - INFO - __main__ - Step: 3750 Loss: 10.2304 LR: 0.000030 +12/08/2025 20:06:01 - INFO - __main__ - Step: 3760 Loss: 10.3545 LR: 0.000030 +12/08/2025 20:06:43 - INFO - __main__ - Step: 3770 Loss: 10.2927 LR: 0.000030 +12/08/2025 20:07:24 - INFO - __main__ - Step: 3780 Loss: 10.2880 LR: 0.000030 +12/08/2025 20:08:06 - INFO - __main__ - Step: 3790 Loss: 10.2844 LR: 0.000030 +12/08/2025 20:08:48 - INFO - __main__ - Step: 3800 Loss: 10.4498 LR: 0.000030 +12/08/2025 20:09:31 - INFO - __main__ - Step: 3810 Loss: 10.2957 LR: 0.000030 +12/08/2025 20:10:13 - INFO - __main__ - Step: 3820 Loss: 10.2790 LR: 0.000030 +12/08/2025 20:10:54 - INFO - __main__ - Step: 3830 Loss: 10.2985 LR: 0.000030 +12/08/2025 20:11:34 - INFO - __main__ - Step: 3840 Loss: 10.2712 LR: 0.000030 +12/08/2025 20:12:16 - INFO - __main__ - Step: 3850 Loss: 10.2378 LR: 0.000030 +12/08/2025 20:12:57 - INFO - __main__ - Step: 3860 Loss: 10.3652 LR: 0.000030 +12/08/2025 20:13:39 - INFO - __main__ - Step: 3870 Loss: 10.2111 LR: 0.000030 +12/08/2025 20:14:19 - INFO - __main__ - Step: 3880 Loss: 10.3703 LR: 0.000030 +12/08/2025 20:15:02 - INFO - __main__ - Step: 3890 Loss: 10.0977 LR: 0.000030 +12/08/2025 20:15:44 - INFO - __main__ - Step: 3900 Loss: 10.3042 LR: 0.000030 +12/08/2025 20:16:26 - INFO - __main__ - Step: 3910 Loss: 10.3480 LR: 0.000030 +12/08/2025 
20:17:08 - INFO - __main__ - Step: 3920 Loss: 10.2822 LR: 0.000030 +12/08/2025 20:17:49 - INFO - __main__ - Step: 3930 Loss: 10.3450 LR: 0.000030 +12/08/2025 20:18:31 - INFO - __main__ - Step: 3940 Loss: 10.0804 LR: 0.000030 +12/08/2025 20:19:12 - INFO - __main__ - Step: 3950 Loss: 10.3248 LR: 0.000030 +12/08/2025 20:19:53 - INFO - __main__ - Step: 3960 Loss: 10.1971 LR: 0.000030 +12/08/2025 20:20:34 - INFO - __main__ - Step: 3970 Loss: 10.2639 LR: 0.000030 +12/08/2025 20:21:14 - INFO - __main__ - Step: 3980 Loss: 10.3137 LR: 0.000030 +12/08/2025 20:21:55 - INFO - __main__ - Step: 3990 Loss: 10.2407 LR: 0.000030 +12/08/2025 20:22:36 - INFO - __main__ - Step: 4000 Loss: 10.2451 LR: 0.000030 +12/08/2025 20:22:36 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000 +12/08/2025 20:22:44 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/optimizer.bin +12/08/2025 20:22:44 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/scheduler.bin +12/08/2025 20:22:44 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/sampler.bin +12/08/2025 20:22:44 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000/random_states_0.pkl +12/08/2025 20:22:44 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4000 +12/08/2025 20:22:44 - INFO - __main__ - Generating videos for validation... +12/08/2025 20:22:44 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.75it/s] +12/08/2025 20:22:50 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 20:23:31 - INFO - __main__ - Step: 4010 Loss: 10.2899 LR: 0.000030 +12/08/2025 20:24:11 - INFO - __main__ - Step: 4020 Loss: 10.3379 LR: 0.000030 +12/08/2025 20:24:51 - INFO - __main__ - Step: 4030 Loss: 10.2633 LR: 0.000030 +12/08/2025 20:25:34 - INFO - __main__ - Step: 4040 Loss: 10.3699 LR: 0.000030 +12/08/2025 20:26:13 - INFO - __main__ - Step: 4050 Loss: 10.2436 LR: 0.000030 +12/08/2025 20:26:54 - INFO - __main__ - Step: 4060 Loss: 10.1246 LR: 0.000030 +12/08/2025 20:27:37 - INFO - __main__ - Step: 4070 Loss: 10.3255 LR: 0.000030 +12/08/2025 20:28:18 - INFO - __main__ - Step: 4080 Loss: 10.3316 LR: 0.000030 +12/08/2025 20:28:57 - INFO - __main__ - Step: 4090 Loss: 10.4003 LR: 0.000030 +12/08/2025 20:29:38 - INFO - __main__ - Step: 4100 Loss: 10.1379 LR: 0.000030 +12/08/2025 20:30:20 - INFO - __main__ - Step: 4110 Loss: 10.1628 LR: 0.000030 +12/08/2025 20:31:02 - INFO - __main__ - Step: 4120 Loss: 10.2809 LR: 0.000030 +12/08/2025 20:31:42 - INFO - __main__ - Step: 4130 Loss: 10.0747 LR: 0.000030 +12/08/2025 20:32:22 - INFO - __main__ - Step: 4140 Loss: 10.1733 LR: 0.000030 +12/08/2025 20:33:02 - INFO - __main__ - Step: 4150 Loss: 10.1063 LR: 0.000030 +12/08/2025 20:33:43 - INFO - __main__ - Step: 4160 Loss: 10.3310 LR: 0.000030 +12/08/2025 20:34:23 - INFO - __main__ - Step: 4170 Loss: 10.3512 LR: 0.000030 +12/08/2025 20:35:03 - INFO - __main__ - Step: 4180 Loss: 10.2131 LR: 0.000030 +12/08/2025 20:35:44 - INFO - __main__ - Step: 4190 Loss: 10.3586 LR: 0.000030 
+12/08/2025 20:36:27 - INFO - __main__ - Step: 4200 Loss: 10.3207 LR: 0.000030 +12/08/2025 20:37:07 - INFO - __main__ - Step: 4210 Loss: 10.3537 LR: 0.000030 +12/08/2025 20:37:47 - INFO - __main__ - Step: 4220 Loss: 10.1798 LR: 0.000030 +12/08/2025 20:38:29 - INFO - __main__ - Step: 4230 Loss: 10.2969 LR: 0.000030 +12/08/2025 20:39:09 - INFO - __main__ - Step: 4240 Loss: 10.1916 LR: 0.000030 +12/08/2025 20:39:50 - INFO - __main__ - Step: 4250 Loss: 10.2351 LR: 0.000030 +12/08/2025 20:40:30 - INFO - __main__ - Step: 4260 Loss: 10.4348 LR: 0.000030 +12/08/2025 20:41:11 - INFO - __main__ - Step: 4270 Loss: 10.2058 LR: 0.000030 +12/08/2025 20:41:52 - INFO - __main__ - Step: 4280 Loss: 10.4054 LR: 0.000030 +12/08/2025 20:42:33 - INFO - __main__ - Step: 4290 Loss: 10.2221 LR: 0.000030 +12/08/2025 20:43:13 - INFO - __main__ - Step: 4300 Loss: 10.3184 LR: 0.000030 +12/08/2025 20:43:56 - INFO - __main__ - Step: 4310 Loss: 10.0380 LR: 0.000030 +12/08/2025 20:44:37 - INFO - __main__ - Step: 4320 Loss: 10.1752 LR: 0.000030 +12/08/2025 20:45:18 - INFO - __main__ - Step: 4330 Loss: 9.9107 LR: 0.000030 +12/08/2025 20:45:59 - INFO - __main__ - Step: 4340 Loss: 10.2164 LR: 0.000030 +12/08/2025 20:46:42 - INFO - __main__ - Step: 4350 Loss: 10.2119 LR: 0.000030 +12/08/2025 20:47:22 - INFO - __main__ - Step: 4360 Loss: 10.1768 LR: 0.000030 +12/08/2025 20:48:05 - INFO - __main__ - Step: 4370 Loss: 10.2246 LR: 0.000030 +12/08/2025 20:48:45 - INFO - __main__ - Step: 4380 Loss: 10.3193 LR: 0.000030 +12/08/2025 20:49:28 - INFO - __main__ - Step: 4390 Loss: 10.2580 LR: 0.000030 +12/08/2025 20:50:10 - INFO - __main__ - Step: 4400 Loss: 10.1080 LR: 0.000030 +12/08/2025 20:50:52 - INFO - __main__ - Step: 4410 Loss: 10.3309 LR: 0.000030 +12/08/2025 20:51:35 - INFO - __main__ - Step: 4420 Loss: 10.2459 LR: 0.000030 +12/08/2025 20:52:17 - INFO - __main__ - Step: 4430 Loss: 10.3830 LR: 0.000030 +12/08/2025 20:52:58 - INFO - __main__ - Step: 4440 Loss: 10.2223 LR: 0.000030 +12/08/2025 20:53:40 - INFO - __main__ - Step: 4450 Loss: 10.2005 LR: 0.000030 +12/08/2025 20:54:23 - INFO - __main__ - Step: 4460 Loss: 10.2870 LR: 0.000030 +12/08/2025 20:55:07 - INFO - __main__ - Step: 4470 Loss: 10.2336 LR: 0.000030 +12/08/2025 20:55:48 - INFO - __main__ - Step: 4480 Loss: 10.1000 LR: 0.000030 +12/08/2025 20:56:31 - INFO - __main__ - Step: 4490 Loss: 10.2858 LR: 0.000030 +12/08/2025 20:57:13 - INFO - __main__ - Step: 4500 Loss: 10.4108 LR: 0.000030 +12/08/2025 20:57:13 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500 +12/08/2025 20:57:22 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/optimizer.bin +12/08/2025 20:57:22 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/scheduler.bin +12/08/2025 20:57:22 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/sampler.bin +12/08/2025 20:57:22 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500/random_states_0.pkl +12/08/2025 20:57:22 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-4500 +12/08/2025 20:57:22 - INFO - __main__ - Generating videos for validation... +12/08/2025 20:57:22 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.80it/s] +12/08/2025 20:57:29 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 20:58:09 - INFO - __main__ - Step: 4510 Loss: 10.2823 LR: 0.000030 +12/08/2025 20:58:51 - INFO - __main__ - Step: 4520 Loss: 10.1449 LR: 0.000030 +12/08/2025 20:59:32 - INFO - __main__ - Step: 4530 Loss: 10.3825 LR: 0.000030 +12/08/2025 21:00:13 - INFO - __main__ - Step: 4540 Loss: 10.0908 LR: 0.000030 +12/08/2025 21:00:56 - INFO - __main__ - Step: 4550 Loss: 10.2845 LR: 0.000030 +12/08/2025 21:01:43 - INFO - __main__ - Step: 4560 Loss: 10.1658 LR: 0.000030 +12/08/2025 21:02:26 - INFO - __main__ - Step: 4570 Loss: 10.3447 LR: 0.000030 +12/08/2025 21:03:07 - INFO - __main__ - Step: 4580 Loss: 10.1652 LR: 0.000030 +12/08/2025 21:03:48 - INFO - __main__ - Step: 4590 Loss: 10.2222 LR: 0.000030 +12/08/2025 21:04:30 - INFO - __main__ - Step: 4600 Loss: 10.0792 LR: 0.000030 +12/08/2025 21:05:13 - INFO - __main__ - Step: 4610 Loss: 10.2119 LR: 0.000030 +12/08/2025 21:05:54 - INFO - __main__ - Step: 4620 Loss: 10.0849 LR: 0.000030 +12/08/2025 21:06:37 - INFO - __main__ - Step: 4630 Loss: 10.0546 LR: 0.000030 +12/08/2025 21:07:19 - INFO - __main__ - Step: 4640 Loss: 10.2588 LR: 0.000030 +12/08/2025 21:08:01 - INFO - __main__ - Step: 4650 Loss: 10.3376 LR: 0.000030 +12/08/2025 21:08:43 - INFO - __main__ - Step: 4660 Loss: 10.2977 LR: 0.000030 +12/08/2025 21:09:25 - INFO - __main__ - Step: 4670 Loss: 10.3725 LR: 0.000030 +12/08/2025 21:10:07 - INFO - __main__ - Step: 4680 Loss: 10.2226 LR: 0.000030 +12/08/2025 21:10:49 - INFO - __main__ - Step: 4690 Loss: 10.3483 LR: 0.000030 +12/08/2025 21:11:31 - INFO - __main__ - Step: 4700 Loss: 10.3505 LR: 0.000030 +12/08/2025 21:12:12 - INFO - __main__ - Step: 4710 Loss: 10.3326 LR: 0.000030 +12/08/2025 21:12:53 - INFO - __main__ - Step: 4720 Loss: 10.3280 LR: 0.000030 +12/08/2025 21:13:36 - INFO - __main__ - Step: 4730 Loss: 10.1439 LR: 0.000030 +12/08/2025 21:14:17 - INFO - __main__ - Step: 4740 Loss: 10.3052 LR: 0.000030 +12/08/2025 21:14:59 - INFO - __main__ - Step: 4750 Loss: 10.3487 LR: 0.000030 +12/08/2025 21:15:41 - INFO - __main__ - Step: 4760 Loss: 10.2699 LR: 0.000030 +12/08/2025 21:16:23 - INFO - __main__ - Step: 4770 Loss: 10.2466 LR: 0.000030 +12/08/2025 21:17:05 - INFO - __main__ - Step: 4780 Loss: 10.2666 LR: 0.000030 +12/08/2025 21:17:47 - INFO - __main__ - Step: 4790 Loss: 9.8289 LR: 0.000030 +12/08/2025 21:18:28 - INFO - __main__ - Step: 4800 Loss: 10.2504 LR: 0.000030 +12/08/2025 21:19:11 - INFO - __main__ - Step: 4810 Loss: 10.3114 LR: 0.000030 +12/08/2025 21:19:52 - INFO - __main__ - Step: 4820 Loss: 10.2730 LR: 0.000030 +12/08/2025 21:20:34 - INFO - __main__ - Step: 4830 Loss: 10.1306 LR: 0.000030 +12/08/2025 21:21:16 - INFO - __main__ - Step: 4840 Loss: 10.3507 LR: 0.000030 +12/08/2025 21:21:59 - INFO - __main__ - Step: 4850 Loss: 10.1111 LR: 0.000030 +12/08/2025 21:22:41 - INFO - __main__ - Step: 4860 Loss: 10.3416 LR: 0.000030 +12/08/2025 21:23:23 - INFO - __main__ - Step: 4870 Loss: 10.1908 LR: 0.000030 +12/08/2025 21:24:05 - INFO - __main__ - Step: 4880 Loss: 10.3423 LR: 0.000030 +12/08/2025 21:24:47 - INFO - __main__ - Step: 4890 Loss: 10.2651 LR: 0.000030 +12/08/2025 21:25:29 - INFO - __main__ - Step: 4900 Loss: 10.3409 LR: 0.000030 +12/08/2025 21:26:10 - INFO - __main__ - Step: 4910 Loss: 9.9730 LR: 0.000030 +12/08/2025 
21:26:52 - INFO - __main__ - Step: 4920 Loss: 10.2333 LR: 0.000030 +12/08/2025 21:27:34 - INFO - __main__ - Step: 4930 Loss: 10.1536 LR: 0.000030 +12/08/2025 21:28:16 - INFO - __main__ - Step: 4940 Loss: 10.2931 LR: 0.000030 +12/08/2025 21:28:57 - INFO - __main__ - Step: 4950 Loss: 10.3159 LR: 0.000030 +12/08/2025 21:29:39 - INFO - __main__ - Step: 4960 Loss: 10.0516 LR: 0.000030 +12/08/2025 21:30:20 - INFO - __main__ - Step: 4970 Loss: 10.1747 LR: 0.000030 +12/08/2025 21:31:01 - INFO - __main__ - Step: 4980 Loss: 10.3765 LR: 0.000030 +12/08/2025 21:31:43 - INFO - __main__ - Step: 4990 Loss: 10.2803 LR: 0.000030 +12/08/2025 21:32:25 - INFO - __main__ - Step: 5000 Loss: 10.3450 LR: 0.000030 +12/08/2025 21:32:25 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000 +12/08/2025 21:32:33 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/optimizer.bin +12/08/2025 21:32:33 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/scheduler.bin +12/08/2025 21:32:33 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/sampler.bin +12/08/2025 21:32:33 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000/random_states_0.pkl +12/08/2025 21:32:33 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5000 +12/08/2025 21:32:33 - INFO - __main__ - Generating videos for validation... +12/08/2025 21:32:33 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.71it/s] +12/08/2025 21:32:39 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 21:33:21 - INFO - __main__ - Step: 5010 Loss: 10.3345 LR: 0.000030 +12/08/2025 21:34:03 - INFO - __main__ - Step: 5020 Loss: 10.3916 LR: 0.000030 +12/08/2025 21:34:45 - INFO - __main__ - Step: 5030 Loss: 10.2735 LR: 0.000030 +12/08/2025 21:35:27 - INFO - __main__ - Step: 5040 Loss: 10.1283 LR: 0.000030 +12/08/2025 21:36:09 - INFO - __main__ - Step: 5050 Loss: 10.2323 LR: 0.000030 +12/08/2025 21:36:50 - INFO - __main__ - Step: 5060 Loss: 10.3613 LR: 0.000030 +12/08/2025 21:37:32 - INFO - __main__ - Step: 5070 Loss: 10.2081 LR: 0.000030 +12/08/2025 21:38:13 - INFO - __main__ - Step: 5080 Loss: 10.2587 LR: 0.000030 +12/08/2025 21:38:55 - INFO - __main__ - Step: 5090 Loss: 10.2870 LR: 0.000030 +12/08/2025 21:39:38 - INFO - __main__ - Step: 5100 Loss: 10.1199 LR: 0.000030 +12/08/2025 21:40:20 - INFO - __main__ - Step: 5110 Loss: 10.2896 LR: 0.000030 +12/08/2025 21:41:04 - INFO - __main__ - Step: 5120 Loss: 10.3799 LR: 0.000030 +12/08/2025 21:41:45 - INFO - __main__ - Step: 5130 Loss: 10.3204 LR: 0.000030 +12/08/2025 21:42:27 - INFO - __main__ - Step: 5140 Loss: 10.2922 LR: 0.000030 +12/08/2025 21:43:09 - INFO - __main__ - Step: 5150 Loss: 9.9432 LR: 0.000030 +12/08/2025 21:43:51 - INFO - __main__ - Step: 5160 Loss: 10.3235 LR: 0.000030 +12/08/2025 21:44:33 - INFO - __main__ - Step: 5170 Loss: 10.3383 LR: 0.000030 +12/08/2025 21:45:14 - INFO - __main__ - Step: 5180 Loss: 10.1858 LR: 0.000030 +12/08/2025 21:45:55 - INFO - __main__ - Step: 5190 Loss: 10.3718 LR: 0.000030 
+12/08/2025 21:46:36 - INFO - __main__ - Step: 5200 Loss: 10.1364 LR: 0.000030 +12/08/2025 21:47:18 - INFO - __main__ - Step: 5210 Loss: 10.1594 LR: 0.000030 +12/08/2025 21:47:58 - INFO - __main__ - Step: 5220 Loss: 10.2590 LR: 0.000030 +12/08/2025 21:48:38 - INFO - __main__ - Step: 5230 Loss: 10.1180 LR: 0.000030 +12/08/2025 21:49:19 - INFO - __main__ - Step: 5240 Loss: 10.3417 LR: 0.000030 +12/08/2025 21:50:01 - INFO - __main__ - Step: 5250 Loss: 10.2217 LR: 0.000030 +12/08/2025 21:50:42 - INFO - __main__ - Step: 5260 Loss: 10.1060 LR: 0.000030 +12/08/2025 21:51:24 - INFO - __main__ - Step: 5270 Loss: 10.0291 LR: 0.000030 +12/08/2025 21:52:05 - INFO - __main__ - Step: 5280 Loss: 10.2215 LR: 0.000030 +12/08/2025 21:52:46 - INFO - __main__ - Step: 5290 Loss: 10.3945 LR: 0.000030 +12/08/2025 21:53:28 - INFO - __main__ - Step: 5300 Loss: 10.0282 LR: 0.000030 +12/08/2025 21:54:09 - INFO - __main__ - Step: 5310 Loss: 10.3425 LR: 0.000030 +12/08/2025 21:54:51 - INFO - __main__ - Step: 5320 Loss: 9.9285 LR: 0.000030 +12/08/2025 21:55:32 - INFO - __main__ - Step: 5330 Loss: 10.2162 LR: 0.000030 +12/08/2025 21:56:14 - INFO - __main__ - Step: 5340 Loss: 10.3321 LR: 0.000030 +12/08/2025 21:56:56 - INFO - __main__ - Step: 5350 Loss: 10.0870 LR: 0.000030 +12/08/2025 21:57:38 - INFO - __main__ - Step: 5360 Loss: 10.0999 LR: 0.000030 +12/08/2025 21:58:20 - INFO - __main__ - Step: 5370 Loss: 10.2815 LR: 0.000030 +12/08/2025 21:59:01 - INFO - __main__ - Step: 5380 Loss: 10.3108 LR: 0.000030 +12/08/2025 21:59:43 - INFO - __main__ - Step: 5390 Loss: 10.2583 LR: 0.000030 +12/08/2025 22:00:25 - INFO - __main__ - Step: 5400 Loss: 10.2013 LR: 0.000030 +12/08/2025 22:01:07 - INFO - __main__ - Step: 5410 Loss: 10.2130 LR: 0.000030 +12/08/2025 22:01:49 - INFO - __main__ - Step: 5420 Loss: 10.0639 LR: 0.000030 +12/08/2025 22:02:31 - INFO - __main__ - Step: 5430 Loss: 10.3222 LR: 0.000030 +12/08/2025 22:03:12 - INFO - __main__ - Step: 5440 Loss: 10.4058 LR: 0.000030 +12/08/2025 22:03:55 - INFO - __main__ - Step: 5450 Loss: 10.1913 LR: 0.000030 +12/08/2025 22:04:36 - INFO - __main__ - Step: 5460 Loss: 10.1779 LR: 0.000030 +12/08/2025 22:05:17 - INFO - __main__ - Step: 5470 Loss: 10.3779 LR: 0.000030 +12/08/2025 22:05:59 - INFO - __main__ - Step: 5480 Loss: 10.1930 LR: 0.000030 +12/08/2025 22:06:42 - INFO - __main__ - Step: 5490 Loss: 10.3855 LR: 0.000030 +12/08/2025 22:07:24 - INFO - __main__ - Step: 5500 Loss: 10.2458 LR: 0.000030 +12/08/2025 22:07:24 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500 +12/08/2025 22:07:31 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/optimizer.bin +12/08/2025 22:07:31 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/scheduler.bin +12/08/2025 22:07:31 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/sampler.bin +12/08/2025 22:07:31 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500/random_states_0.pkl +12/08/2025 22:07:31 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-5500 +12/08/2025 22:07:31 - INFO - __main__ - Generating videos for validation... +12/08/2025 22:07:31 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.90it/s] +12/08/2025 22:07:37 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 22:08:19 - INFO - __main__ - Step: 5510 Loss: 10.4149 LR: 0.000030 +12/08/2025 22:09:02 - INFO - __main__ - Step: 5520 Loss: 10.2892 LR: 0.000030 +12/08/2025 22:09:44 - INFO - __main__ - Step: 5530 Loss: 10.3342 LR: 0.000030 +12/08/2025 22:10:27 - INFO - __main__ - Step: 5540 Loss: 10.3278 LR: 0.000030 +12/08/2025 22:11:11 - INFO - __main__ - Step: 5550 Loss: 10.3007 LR: 0.000030 +12/08/2025 22:11:53 - INFO - __main__ - Step: 5560 Loss: 9.9747 LR: 0.000030 +12/08/2025 22:12:34 - INFO - __main__ - Step: 5570 Loss: 9.9621 LR: 0.000030 +12/08/2025 22:13:17 - INFO - __main__ - Step: 5580 Loss: 10.1569 LR: 0.000030 +12/08/2025 22:13:59 - INFO - __main__ - Step: 5590 Loss: 10.2479 LR: 0.000030 +12/08/2025 22:14:41 - INFO - __main__ - Step: 5600 Loss: 10.2491 LR: 0.000030 +12/08/2025 22:15:23 - INFO - __main__ - Step: 5610 Loss: 10.1147 LR: 0.000030 +12/08/2025 22:16:05 - INFO - __main__ - Step: 5620 Loss: 10.3269 LR: 0.000030 +12/08/2025 22:16:48 - INFO - __main__ - Step: 5630 Loss: 10.0841 LR: 0.000030 +12/08/2025 22:17:30 - INFO - __main__ - Step: 5640 Loss: 10.3737 LR: 0.000030 +12/08/2025 22:18:12 - INFO - __main__ - Step: 5650 Loss: 10.3548 LR: 0.000030 +12/08/2025 22:18:54 - INFO - __main__ - Step: 5660 Loss: 10.2859 LR: 0.000030 +12/08/2025 22:19:37 - INFO - __main__ - Step: 5670 Loss: 10.1853 LR: 0.000030 +12/08/2025 22:20:19 - INFO - __main__ - Step: 5680 Loss: 10.3632 LR: 0.000030 +12/08/2025 22:21:03 - INFO - __main__ - Step: 5690 Loss: 10.2320 LR: 0.000030 +12/08/2025 22:21:45 - INFO - __main__ - Step: 5700 Loss: 10.2409 LR: 0.000030 +12/08/2025 22:22:27 - INFO - __main__ - Step: 5710 Loss: 10.3355 LR: 0.000030 +12/08/2025 22:23:10 - INFO - __main__ - Step: 5720 Loss: 10.2148 LR: 0.000030 +12/08/2025 22:23:52 - INFO - __main__ - Step: 5730 Loss: 10.2750 LR: 0.000030 +12/08/2025 22:24:33 - INFO - __main__ - Step: 5740 Loss: 10.2772 LR: 0.000030 +12/08/2025 22:25:15 - INFO - __main__ - Step: 5750 Loss: 10.1679 LR: 0.000030 +12/08/2025 22:25:56 - INFO - __main__ - Step: 5760 Loss: 10.2853 LR: 0.000030 +12/08/2025 22:26:39 - INFO - __main__ - Step: 5770 Loss: 10.3320 LR: 0.000030 +12/08/2025 22:27:22 - INFO - __main__ - Step: 5780 Loss: 10.3134 LR: 0.000030 +12/08/2025 22:28:04 - INFO - __main__ - Step: 5790 Loss: 10.3775 LR: 0.000030 +12/08/2025 22:28:47 - INFO - __main__ - Step: 5800 Loss: 10.1853 LR: 0.000030 +12/08/2025 22:29:27 - INFO - __main__ - Step: 5810 Loss: 10.2957 LR: 0.000030 +12/08/2025 22:30:10 - INFO - __main__ - Step: 5820 Loss: 10.2954 LR: 0.000030 +12/08/2025 22:30:52 - INFO - __main__ - Step: 5830 Loss: 9.9769 LR: 0.000030 +12/08/2025 22:31:34 - INFO - __main__ - Step: 5840 Loss: 10.1726 LR: 0.000030 +12/08/2025 22:32:16 - INFO - __main__ - Step: 5850 Loss: 10.2430 LR: 0.000030 +12/08/2025 22:32:59 - INFO - __main__ - Step: 5860 Loss: 10.3791 LR: 0.000030 +12/08/2025 22:33:41 - INFO - __main__ - Step: 5870 Loss: 10.2243 LR: 0.000030 +12/08/2025 22:34:23 - INFO - __main__ - Step: 5880 Loss: 10.3128 LR: 0.000030 +12/08/2025 22:35:06 - INFO - __main__ - Step: 5890 Loss: 10.3263 LR: 0.000030 +12/08/2025 22:35:47 - INFO - __main__ - Step: 5900 Loss: 10.2545 LR: 0.000030 +12/08/2025 22:36:29 - INFO - __main__ - Step: 5910 Loss: 10.4141 LR: 0.000030 +12/08/2025 
22:37:11 - INFO - __main__ - Step: 5920 Loss: 10.2515 LR: 0.000030 +12/08/2025 22:37:53 - INFO - __main__ - Step: 5930 Loss: 10.2135 LR: 0.000030 +12/08/2025 22:38:35 - INFO - __main__ - Step: 5940 Loss: 10.2881 LR: 0.000030 +12/08/2025 22:39:18 - INFO - __main__ - Step: 5950 Loss: 10.1997 LR: 0.000030 +12/08/2025 22:40:01 - INFO - __main__ - Step: 5960 Loss: 10.2877 LR: 0.000030 +12/08/2025 22:40:43 - INFO - __main__ - Step: 5970 Loss: 10.0559 LR: 0.000030 +12/08/2025 22:41:26 - INFO - __main__ - Step: 5980 Loss: 10.1773 LR: 0.000030 +12/08/2025 22:42:07 - INFO - __main__ - Step: 5990 Loss: 10.1486 LR: 0.000030 +12/08/2025 22:42:50 - INFO - __main__ - Step: 6000 Loss: 10.3314 LR: 0.000030 +12/08/2025 22:42:50 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000 +12/08/2025 22:42:57 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/optimizer.bin +12/08/2025 22:42:57 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/scheduler.bin +12/08/2025 22:42:57 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/sampler.bin +12/08/2025 22:42:57 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000/random_states_0.pkl +12/08/2025 22:42:57 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6000 +12/08/2025 22:42:57 - INFO - __main__ - Generating videos for validation... +12/08/2025 22:42:57 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.91it/s] +12/08/2025 22:43:04 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 22:43:46 - INFO - __main__ - Step: 6010 Loss: 10.1129 LR: 0.000030 +12/08/2025 22:44:27 - INFO - __main__ - Step: 6020 Loss: 10.2666 LR: 0.000030 +12/08/2025 22:45:12 - INFO - __main__ - Step: 6030 Loss: 10.3086 LR: 0.000030 +12/08/2025 22:45:54 - INFO - __main__ - Step: 6040 Loss: 10.2977 LR: 0.000030 +12/08/2025 22:46:35 - INFO - __main__ - Step: 6050 Loss: 10.2531 LR: 0.000030 +12/08/2025 22:47:17 - INFO - __main__ - Step: 6060 Loss: 10.1809 LR: 0.000030 +12/08/2025 22:47:59 - INFO - __main__ - Step: 6070 Loss: 10.1784 LR: 0.000030 +12/08/2025 22:48:40 - INFO - __main__ - Step: 6080 Loss: 10.3065 LR: 0.000030 +12/08/2025 22:49:22 - INFO - __main__ - Step: 6090 Loss: 10.3134 LR: 0.000030 +12/08/2025 22:50:04 - INFO - __main__ - Step: 6100 Loss: 10.3101 LR: 0.000030 +12/08/2025 22:50:46 - INFO - __main__ - Step: 6110 Loss: 10.2897 LR: 0.000030 +12/08/2025 22:51:28 - INFO - __main__ - Step: 6120 Loss: 10.1782 LR: 0.000030 +12/08/2025 22:52:11 - INFO - __main__ - Step: 6130 Loss: 10.2543 LR: 0.000030 +12/08/2025 22:52:53 - INFO - __main__ - Step: 6140 Loss: 10.2993 LR: 0.000030 +12/08/2025 22:53:35 - INFO - __main__ - Step: 6150 Loss: 10.2649 LR: 0.000030 +12/08/2025 22:54:16 - INFO - __main__ - Step: 6160 Loss: 10.1428 LR: 0.000030 +12/08/2025 22:54:58 - INFO - __main__ - Step: 6170 Loss: 10.2793 LR: 0.000030 +12/08/2025 22:55:40 - INFO - __main__ - Step: 6180 Loss: 10.2793 LR: 0.000030 +12/08/2025 22:56:21 - INFO - __main__ - Step: 6190 Loss: 10.2194 LR: 0.000030 
+12/08/2025 22:57:03 - INFO - __main__ - Step: 6200 Loss: 10.1877 LR: 0.000030 +12/08/2025 22:57:45 - INFO - __main__ - Step: 6210 Loss: 10.0722 LR: 0.000030 +12/08/2025 22:58:27 - INFO - __main__ - Step: 6220 Loss: 9.8783 LR: 0.000030 +12/08/2025 22:59:09 - INFO - __main__ - Step: 6230 Loss: 10.1823 LR: 0.000030 +12/08/2025 22:59:51 - INFO - __main__ - Step: 6240 Loss: 10.1875 LR: 0.000030 +12/08/2025 23:00:32 - INFO - __main__ - Step: 6250 Loss: 10.3461 LR: 0.000030 +12/08/2025 23:01:15 - INFO - __main__ - Step: 6260 Loss: 10.3449 LR: 0.000030 +12/08/2025 23:01:58 - INFO - __main__ - Step: 6270 Loss: 9.9969 LR: 0.000030 +12/08/2025 23:02:39 - INFO - __main__ - Step: 6280 Loss: 10.2543 LR: 0.000030 +12/08/2025 23:03:21 - INFO - __main__ - Step: 6290 Loss: 10.0740 LR: 0.000030 +12/08/2025 23:04:03 - INFO - __main__ - Step: 6300 Loss: 10.1394 LR: 0.000030 +12/08/2025 23:04:44 - INFO - __main__ - Step: 6310 Loss: 10.2553 LR: 0.000030 +12/08/2025 23:05:27 - INFO - __main__ - Step: 6320 Loss: 10.2410 LR: 0.000030 +12/08/2025 23:06:10 - INFO - __main__ - Step: 6330 Loss: 10.3733 LR: 0.000030 +12/08/2025 23:06:51 - INFO - __main__ - Step: 6340 Loss: 10.3512 LR: 0.000030 +12/08/2025 23:07:33 - INFO - __main__ - Step: 6350 Loss: 10.3197 LR: 0.000030 +12/08/2025 23:08:18 - INFO - __main__ - Step: 6360 Loss: 10.2109 LR: 0.000030 +12/08/2025 23:09:00 - INFO - __main__ - Step: 6370 Loss: 10.3247 LR: 0.000030 +12/08/2025 23:09:42 - INFO - __main__ - Step: 6380 Loss: 10.3582 LR: 0.000030 +12/08/2025 23:10:25 - INFO - __main__ - Step: 6390 Loss: 10.3242 LR: 0.000030 +12/08/2025 23:11:07 - INFO - __main__ - Step: 6400 Loss: 10.1779 LR: 0.000030 +12/08/2025 23:11:48 - INFO - __main__ - Step: 6410 Loss: 10.2039 LR: 0.000030 +12/08/2025 23:12:31 - INFO - __main__ - Step: 6420 Loss: 10.2187 LR: 0.000030 +12/08/2025 23:13:12 - INFO - __main__ - Step: 6430 Loss: 10.3584 LR: 0.000030 +12/08/2025 23:13:53 - INFO - __main__ - Step: 6440 Loss: 10.2263 LR: 0.000030 +12/08/2025 23:14:35 - INFO - __main__ - Step: 6450 Loss: 10.2296 LR: 0.000030 +12/08/2025 23:15:16 - INFO - __main__ - Step: 6460 Loss: 10.2907 LR: 0.000030 +12/08/2025 23:15:58 - INFO - __main__ - Step: 6470 Loss: 10.3953 LR: 0.000030 +12/08/2025 23:16:41 - INFO - __main__ - Step: 6480 Loss: 10.4121 LR: 0.000030 +12/08/2025 23:17:23 - INFO - __main__ - Step: 6490 Loss: 10.3521 LR: 0.000030 +12/08/2025 23:18:05 - INFO - __main__ - Step: 6500 Loss: 10.2929 LR: 0.000030 +12/08/2025 23:18:05 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500 +12/08/2025 23:18:15 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/optimizer.bin +12/08/2025 23:18:15 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/scheduler.bin +12/08/2025 23:18:15 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/sampler.bin +12/08/2025 23:18:15 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500/random_states_0.pkl +12/08/2025 23:18:15 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-6500 +12/08/2025 23:18:15 - INFO - __main__ - Generating videos for validation... +12/08/2025 23:18:15 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.64it/s] +12/08/2025 23:18:22 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 23:19:03 - INFO - __main__ - Step: 6510 Loss: 10.1292 LR: 0.000030 +12/08/2025 23:19:44 - INFO - __main__ - Step: 6520 Loss: 10.3520 LR: 0.000030 +12/08/2025 23:20:25 - INFO - __main__ - Step: 6530 Loss: 10.1364 LR: 0.000030 +12/08/2025 23:21:06 - INFO - __main__ - Step: 6540 Loss: 10.2199 LR: 0.000030 +12/08/2025 23:21:46 - INFO - __main__ - Step: 6550 Loss: 10.0978 LR: 0.000030 +12/08/2025 23:22:27 - INFO - __main__ - Step: 6560 Loss: 10.1592 LR: 0.000030 +12/08/2025 23:23:08 - INFO - __main__ - Step: 6570 Loss: 10.1996 LR: 0.000030 +12/08/2025 23:23:52 - INFO - __main__ - Step: 6580 Loss: 10.3180 LR: 0.000030 +12/08/2025 23:24:32 - INFO - __main__ - Step: 6590 Loss: 10.1698 LR: 0.000030 +12/08/2025 23:25:12 - INFO - __main__ - Step: 6600 Loss: 10.3018 LR: 0.000030 +12/08/2025 23:25:53 - INFO - __main__ - Step: 6610 Loss: 10.1288 LR: 0.000030 +12/08/2025 23:26:33 - INFO - __main__ - Step: 6620 Loss: 10.4338 LR: 0.000030 +12/08/2025 23:27:15 - INFO - __main__ - Step: 6630 Loss: 10.2878 LR: 0.000030 +12/08/2025 23:27:56 - INFO - __main__ - Step: 6640 Loss: 10.2953 LR: 0.000030 +12/08/2025 23:28:39 - INFO - __main__ - Step: 6650 Loss: 10.2604 LR: 0.000030 +12/08/2025 23:29:21 - INFO - __main__ - Step: 6660 Loss: 10.1341 LR: 0.000030 +12/08/2025 23:30:03 - INFO - __main__ - Step: 6670 Loss: 10.3370 LR: 0.000030 +12/08/2025 23:30:44 - INFO - __main__ - Step: 6680 Loss: 10.0830 LR: 0.000030 +12/08/2025 23:31:25 - INFO - __main__ - Step: 6690 Loss: 10.2127 LR: 0.000030 +12/08/2025 23:32:06 - INFO - __main__ - Step: 6700 Loss: 9.8881 LR: 0.000030 +12/08/2025 23:32:46 - INFO - __main__ - Step: 6710 Loss: 10.3647 LR: 0.000030 +12/08/2025 23:33:29 - INFO - __main__ - Step: 6720 Loss: 10.2329 LR: 0.000030 +12/08/2025 23:34:10 - INFO - __main__ - Step: 6730 Loss: 10.1342 LR: 0.000030 +12/08/2025 23:34:52 - INFO - __main__ - Step: 6740 Loss: 10.2022 LR: 0.000030 +12/08/2025 23:35:35 - INFO - __main__ - Step: 6750 Loss: 10.3053 LR: 0.000030 +12/08/2025 23:36:17 - INFO - __main__ - Step: 6760 Loss: 10.1120 LR: 0.000030 +12/08/2025 23:36:57 - INFO - __main__ - Step: 6770 Loss: 10.1048 LR: 0.000030 +12/08/2025 23:37:39 - INFO - __main__ - Step: 6780 Loss: 10.3502 LR: 0.000030 +12/08/2025 23:38:21 - INFO - __main__ - Step: 6790 Loss: 10.3046 LR: 0.000030 +12/08/2025 23:39:02 - INFO - __main__ - Step: 6800 Loss: 10.1877 LR: 0.000030 +12/08/2025 23:39:44 - INFO - __main__ - Step: 6810 Loss: 10.0637 LR: 0.000030 +12/08/2025 23:40:26 - INFO - __main__ - Step: 6820 Loss: 10.3750 LR: 0.000030 +12/08/2025 23:41:08 - INFO - __main__ - Step: 6830 Loss: 10.1803 LR: 0.000030 +12/08/2025 23:41:50 - INFO - __main__ - Step: 6840 Loss: 10.3268 LR: 0.000030 +12/08/2025 23:42:33 - INFO - __main__ - Step: 6850 Loss: 10.1543 LR: 0.000030 +12/08/2025 23:43:16 - INFO - __main__ - Step: 6860 Loss: 10.2235 LR: 0.000030 +12/08/2025 23:43:57 - INFO - __main__ - Step: 6870 Loss: 10.1485 LR: 0.000030 +12/08/2025 23:44:38 - INFO - __main__ - Step: 6880 Loss: 10.1428 LR: 0.000030 +12/08/2025 23:45:19 - INFO - __main__ - Step: 6890 Loss: 10.3582 LR: 0.000030 +12/08/2025 23:46:02 - INFO - __main__ - Step: 6900 Loss: 10.2796 LR: 0.000030 +12/08/2025 23:46:43 - INFO - __main__ - Step: 6910 Loss: 10.2365 LR: 0.000030 +12/08/2025 
23:47:26 - INFO - __main__ - Step: 6920 Loss: 10.1180 LR: 0.000030 +12/08/2025 23:48:08 - INFO - __main__ - Step: 6930 Loss: 10.2786 LR: 0.000030 +12/08/2025 23:48:50 - INFO - __main__ - Step: 6940 Loss: 10.3998 LR: 0.000030 +12/08/2025 23:49:33 - INFO - __main__ - Step: 6950 Loss: 10.3113 LR: 0.000030 +12/08/2025 23:50:15 - INFO - __main__ - Step: 6960 Loss: 10.3230 LR: 0.000030 +12/08/2025 23:50:57 - INFO - __main__ - Step: 6970 Loss: 10.3072 LR: 0.000030 +12/08/2025 23:51:39 - INFO - __main__ - Step: 6980 Loss: 10.1016 LR: 0.000030 +12/08/2025 23:52:22 - INFO - __main__ - Step: 6990 Loss: 10.2765 LR: 0.000030 +12/08/2025 23:53:04 - INFO - __main__ - Step: 7000 Loss: 10.3480 LR: 0.000030 +12/08/2025 23:53:04 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000 +12/08/2025 23:53:11 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/optimizer.bin +12/08/2025 23:53:11 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/scheduler.bin +12/08/2025 23:53:11 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/sampler.bin +12/08/2025 23:53:11 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000/random_states_0.pkl +12/08/2025 23:53:11 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7000 +12/08/2025 23:53:11 - INFO - __main__ - Generating videos for validation... +12/08/2025 23:53:11 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.88it/s] +12/08/2025 23:53:18 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/08/2025 23:53:59 - INFO - __main__ - Step: 7010 Loss: 10.2368 LR: 0.000030 +12/08/2025 23:54:42 - INFO - __main__ - Step: 7020 Loss: 10.1803 LR: 0.000030 +12/08/2025 23:55:24 - INFO - __main__ - Step: 7030 Loss: 10.2376 LR: 0.000030 +12/08/2025 23:56:05 - INFO - __main__ - Step: 7040 Loss: 10.2110 LR: 0.000030 +12/08/2025 23:56:47 - INFO - __main__ - Step: 7050 Loss: 10.3305 LR: 0.000030 +12/08/2025 23:57:29 - INFO - __main__ - Step: 7060 Loss: 10.2698 LR: 0.000030 +12/08/2025 23:58:10 - INFO - __main__ - Step: 7070 Loss: 10.3615 LR: 0.000030 +12/08/2025 23:58:52 - INFO - __main__ - Step: 7080 Loss: 10.1830 LR: 0.000030 +12/08/2025 23:59:34 - INFO - __main__ - Step: 7090 Loss: 10.1951 LR: 0.000030 +12/09/2025 00:00:17 - INFO - __main__ - Step: 7100 Loss: 10.1074 LR: 0.000030 +12/09/2025 00:00:59 - INFO - __main__ - Step: 7110 Loss: 10.3386 LR: 0.000030 +12/09/2025 00:01:43 - INFO - __main__ - Step: 7120 Loss: 9.8956 LR: 0.000030 +12/09/2025 00:02:25 - INFO - __main__ - Step: 7130 Loss: 10.3152 LR: 0.000030 +12/09/2025 00:03:08 - INFO - __main__ - Step: 7140 Loss: 10.3875 LR: 0.000030 +12/09/2025 00:03:50 - INFO - __main__ - Step: 7150 Loss: 10.3053 LR: 0.000030 +12/09/2025 00:04:32 - INFO - __main__ - Step: 7160 Loss: 10.3367 LR: 0.000030 +12/09/2025 00:05:15 - INFO - __main__ - Step: 7170 Loss: 10.2307 LR: 0.000030 +12/09/2025 00:05:57 - INFO - __main__ - Step: 7180 Loss: 10.2600 LR: 0.000030 +12/09/2025 00:06:40 - INFO - __main__ - Step: 7190 Loss: 10.3188 LR: 0.000030 
+12/09/2025 00:07:22 - INFO - __main__ - Step: 7200 Loss: 10.2022 LR: 0.000030 +12/09/2025 00:08:03 - INFO - __main__ - Step: 7210 Loss: 10.2719 LR: 0.000030 +12/09/2025 00:08:43 - INFO - __main__ - Step: 7220 Loss: 10.3750 LR: 0.000030 +12/09/2025 00:09:24 - INFO - __main__ - Step: 7230 Loss: 10.2134 LR: 0.000030 +12/09/2025 00:10:06 - INFO - __main__ - Step: 7240 Loss: 10.3141 LR: 0.000030 +12/09/2025 00:10:48 - INFO - __main__ - Step: 7250 Loss: 10.2763 LR: 0.000030 +12/09/2025 00:11:30 - INFO - __main__ - Step: 7260 Loss: 10.3463 LR: 0.000030 +12/09/2025 00:12:12 - INFO - __main__ - Step: 7270 Loss: 10.0836 LR: 0.000030 +12/09/2025 00:12:53 - INFO - __main__ - Step: 7280 Loss: 10.2993 LR: 0.000030 +12/09/2025 00:13:34 - INFO - __main__ - Step: 7290 Loss: 10.3237 LR: 0.000030 +12/09/2025 00:14:15 - INFO - __main__ - Step: 7300 Loss: 10.2270 LR: 0.000030 +12/09/2025 00:14:56 - INFO - __main__ - Step: 7310 Loss: 10.2369 LR: 0.000030 +12/09/2025 00:15:38 - INFO - __main__ - Step: 7320 Loss: 10.2477 LR: 0.000030 +12/09/2025 00:16:20 - INFO - __main__ - Step: 7330 Loss: 10.3374 LR: 0.000030 +12/09/2025 00:17:02 - INFO - __main__ - Step: 7340 Loss: 9.8238 LR: 0.000030 +12/09/2025 00:17:44 - INFO - __main__ - Step: 7350 Loss: 10.3105 LR: 0.000030 +12/09/2025 00:18:26 - INFO - __main__ - Step: 7360 Loss: 10.1948 LR: 0.000030 +12/09/2025 00:19:06 - INFO - __main__ - Step: 7370 Loss: 10.1476 LR: 0.000030 +12/09/2025 00:19:47 - INFO - __main__ - Step: 7380 Loss: 10.1568 LR: 0.000030 +12/09/2025 00:20:28 - INFO - __main__ - Step: 7390 Loss: 10.3466 LR: 0.000030 +12/09/2025 00:21:09 - INFO - __main__ - Step: 7400 Loss: 10.3168 LR: 0.000030 +12/09/2025 00:21:50 - INFO - __main__ - Step: 7410 Loss: 10.1458 LR: 0.000030 +12/09/2025 00:22:31 - INFO - __main__ - Step: 7420 Loss: 10.2313 LR: 0.000030 +12/09/2025 00:23:10 - INFO - __main__ - Step: 7430 Loss: 9.8185 LR: 0.000030 +12/09/2025 00:23:50 - INFO - __main__ - Step: 7440 Loss: 10.1768 LR: 0.000030 +12/09/2025 00:24:29 - INFO - __main__ - Step: 7450 Loss: 10.3064 LR: 0.000030 +12/09/2025 00:25:10 - INFO - __main__ - Step: 7460 Loss: 10.1902 LR: 0.000030 +12/09/2025 00:25:49 - INFO - __main__ - Step: 7470 Loss: 10.3586 LR: 0.000030 +12/09/2025 00:26:29 - INFO - __main__ - Step: 7480 Loss: 10.2662 LR: 0.000030 +12/09/2025 00:27:09 - INFO - __main__ - Step: 7490 Loss: 10.3064 LR: 0.000030 +12/09/2025 00:27:50 - INFO - __main__ - Step: 7500 Loss: 10.2657 LR: 0.000030 +12/09/2025 00:27:50 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500 +12/09/2025 00:28:00 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/optimizer.bin +12/09/2025 00:28:00 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/scheduler.bin +12/09/2025 00:28:00 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/sampler.bin +12/09/2025 00:28:00 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500/random_states_0.pkl +12/09/2025 00:28:00 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-7500 +12/09/2025 00:28:00 - INFO - __main__ - Generating videos for validation... +12/09/2025 00:28:00 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.69it/s] +12/09/2025 00:28:06 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/09/2025 00:28:48 - INFO - __main__ - Step: 7510 Loss: 10.1254 LR: 0.000030 +12/09/2025 00:29:29 - INFO - __main__ - Step: 7520 Loss: 10.2718 LR: 0.000030 +12/09/2025 00:30:14 - INFO - __main__ - Step: 7530 Loss: 10.3707 LR: 0.000030 +12/09/2025 00:30:55 - INFO - __main__ - Step: 7540 Loss: 10.2575 LR: 0.000030 +12/09/2025 00:31:37 - INFO - __main__ - Step: 7550 Loss: 10.3026 LR: 0.000030 +12/09/2025 00:32:18 - INFO - __main__ - Step: 7560 Loss: 10.3086 LR: 0.000030 +12/09/2025 00:32:58 - INFO - __main__ - Step: 7570 Loss: 9.9485 LR: 0.000030 +12/09/2025 00:33:38 - INFO - __main__ - Step: 7580 Loss: 10.1990 LR: 0.000030 +12/09/2025 00:34:20 - INFO - __main__ - Step: 7590 Loss: 10.2143 LR: 0.000030 +12/09/2025 00:34:59 - INFO - __main__ - Step: 7600 Loss: 10.3632 LR: 0.000030 +12/09/2025 00:35:40 - INFO - __main__ - Step: 7610 Loss: 10.2531 LR: 0.000030 +12/09/2025 00:36:21 - INFO - __main__ - Step: 7620 Loss: 10.0739 LR: 0.000030 +12/09/2025 00:37:03 - INFO - __main__ - Step: 7630 Loss: 10.2402 LR: 0.000030 +12/09/2025 00:37:45 - INFO - __main__ - Step: 7640 Loss: 10.3283 LR: 0.000030 +12/09/2025 00:38:27 - INFO - __main__ - Step: 7650 Loss: 10.2601 LR: 0.000030 +12/09/2025 00:39:09 - INFO - __main__ - Step: 7660 Loss: 10.2100 LR: 0.000030 +12/09/2025 00:39:50 - INFO - __main__ - Step: 7670 Loss: 10.1779 LR: 0.000030 +12/09/2025 00:40:30 - INFO - __main__ - Step: 7680 Loss: 10.2502 LR: 0.000030 +12/09/2025 00:41:13 - INFO - __main__ - Step: 7690 Loss: 10.2797 LR: 0.000030 +12/09/2025 00:41:54 - INFO - __main__ - Step: 7700 Loss: 10.2384 LR: 0.000030 +12/09/2025 00:42:36 - INFO - __main__ - Step: 7710 Loss: 10.1634 LR: 0.000030 +12/09/2025 00:43:18 - INFO - __main__ - Step: 7720 Loss: 10.1549 LR: 0.000030 +12/09/2025 00:43:59 - INFO - __main__ - Step: 7730 Loss: 10.2102 LR: 0.000030 +12/09/2025 00:44:40 - INFO - __main__ - Step: 7740 Loss: 10.2895 LR: 0.000030 +12/09/2025 00:45:22 - INFO - __main__ - Step: 7750 Loss: 10.2618 LR: 0.000030 +12/09/2025 00:46:02 - INFO - __main__ - Step: 7760 Loss: 10.2558 LR: 0.000030 +12/09/2025 00:46:44 - INFO - __main__ - Step: 7770 Loss: 10.2425 LR: 0.000030 +12/09/2025 00:47:23 - INFO - __main__ - Step: 7780 Loss: 9.7965 LR: 0.000030 +12/09/2025 00:48:04 - INFO - __main__ - Step: 7790 Loss: 10.3123 LR: 0.000030 +12/09/2025 00:48:44 - INFO - __main__ - Step: 7800 Loss: 10.2295 LR: 0.000030 +12/09/2025 00:49:25 - INFO - __main__ - Step: 7810 Loss: 10.3369 LR: 0.000030 +12/09/2025 00:50:07 - INFO - __main__ - Step: 7820 Loss: 10.3905 LR: 0.000030 +12/09/2025 00:50:48 - INFO - __main__ - Step: 7830 Loss: 10.2884 LR: 0.000030 +12/09/2025 00:51:29 - INFO - __main__ - Step: 7840 Loss: 10.2730 LR: 0.000030 +12/09/2025 00:52:12 - INFO - __main__ - Step: 7850 Loss: 10.0763 LR: 0.000030 +12/09/2025 00:52:53 - INFO - __main__ - Step: 7860 Loss: 10.2126 LR: 0.000030 +12/09/2025 00:53:35 - INFO - __main__ - Step: 7870 Loss: 10.0994 LR: 0.000030 +12/09/2025 00:54:15 - INFO - __main__ - Step: 7880 Loss: 10.1571 LR: 0.000030 +12/09/2025 00:54:57 - INFO - __main__ - Step: 7890 Loss: 10.3652 LR: 0.000030 +12/09/2025 00:55:38 - INFO - __main__ - Step: 7900 Loss: 10.4227 LR: 0.000030 +12/09/2025 00:56:18 - INFO - __main__ - Step: 7910 Loss: 10.3050 LR: 0.000030 +12/09/2025 
00:57:00 - INFO - __main__ - Step: 7920 Loss: 10.2642 LR: 0.000030 +12/09/2025 00:57:42 - INFO - __main__ - Step: 7930 Loss: 10.2392 LR: 0.000030 +12/09/2025 00:58:23 - INFO - __main__ - Step: 7940 Loss: 10.3895 LR: 0.000030 +12/09/2025 00:59:04 - INFO - __main__ - Step: 7950 Loss: 9.9881 LR: 0.000030 +12/09/2025 00:59:45 - INFO - __main__ - Step: 7960 Loss: 10.2459 LR: 0.000030 +12/09/2025 01:00:27 - INFO - __main__ - Step: 7970 Loss: 10.3424 LR: 0.000030 +12/09/2025 01:01:11 - INFO - __main__ - Step: 7980 Loss: 10.2866 LR: 0.000030 +12/09/2025 01:01:53 - INFO - __main__ - Step: 7990 Loss: 10.2693 LR: 0.000030 +12/09/2025 01:02:36 - INFO - __main__ - Step: 8000 Loss: 10.3038 LR: 0.000030 +12/09/2025 01:02:36 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000 +12/09/2025 01:02:43 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/optimizer.bin +12/09/2025 01:02:43 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/scheduler.bin +12/09/2025 01:02:43 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/sampler.bin +12/09/2025 01:02:43 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000/random_states_0.pkl +12/09/2025 01:02:43 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8000 +12/09/2025 01:02:43 - INFO - __main__ - Generating videos for validation... +12/09/2025 01:02:43 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.65it/s] +12/09/2025 01:02:50 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/09/2025 01:03:31 - INFO - __main__ - Step: 8010 Loss: 10.1872 LR: 0.000030 +12/09/2025 01:04:14 - INFO - __main__ - Step: 8020 Loss: 10.2005 LR: 0.000030 +12/09/2025 01:04:57 - INFO - __main__ - Step: 8030 Loss: 10.2280 LR: 0.000030 +12/09/2025 01:05:40 - INFO - __main__ - Step: 8040 Loss: 10.2187 LR: 0.000030 +12/09/2025 01:06:22 - INFO - __main__ - Step: 8050 Loss: 10.3216 LR: 0.000030 +12/09/2025 01:07:05 - INFO - __main__ - Step: 8060 Loss: 10.3150 LR: 0.000030 +12/09/2025 01:07:46 - INFO - __main__ - Step: 8070 Loss: 10.2519 LR: 0.000030 +12/09/2025 01:08:27 - INFO - __main__ - Step: 8080 Loss: 10.1123 LR: 0.000030 +12/09/2025 01:09:08 - INFO - __main__ - Step: 8090 Loss: 10.2094 LR: 0.000030 +12/09/2025 01:09:49 - INFO - __main__ - Step: 8100 Loss: 10.2756 LR: 0.000030 +12/09/2025 01:10:33 - INFO - __main__ - Step: 8110 Loss: 10.2123 LR: 0.000030 +12/09/2025 01:11:14 - INFO - __main__ - Step: 8120 Loss: 10.2951 LR: 0.000030 +12/09/2025 01:11:54 - INFO - __main__ - Step: 8130 Loss: 10.2351 LR: 0.000030 +12/09/2025 01:12:34 - INFO - __main__ - Step: 8140 Loss: 10.2263 LR: 0.000030 +12/09/2025 01:13:16 - INFO - __main__ - Step: 8150 Loss: 10.3499 LR: 0.000030 +12/09/2025 01:13:59 - INFO - __main__ - Step: 8160 Loss: 10.1430 LR: 0.000030 +12/09/2025 01:14:41 - INFO - __main__ - Step: 8170 Loss: 10.1110 LR: 0.000030 +12/09/2025 01:15:22 - INFO - __main__ - Step: 8180 Loss: 10.3154 LR: 0.000030 +12/09/2025 01:16:04 - INFO - __main__ - Step: 8190 Loss: 10.2972 LR: 0.000030 
+12/09/2025 01:16:44 - INFO - __main__ - Step: 8200 Loss: 10.3163 LR: 0.000030 +12/09/2025 01:17:24 - INFO - __main__ - Step: 8210 Loss: 10.2279 LR: 0.000030 +12/09/2025 01:18:05 - INFO - __main__ - Step: 8220 Loss: 10.1925 LR: 0.000030 +12/09/2025 01:18:45 - INFO - __main__ - Step: 8230 Loss: 10.2277 LR: 0.000030 +12/09/2025 01:19:27 - INFO - __main__ - Step: 8240 Loss: 10.2339 LR: 0.000030 +12/09/2025 01:20:07 - INFO - __main__ - Step: 8250 Loss: 10.2727 LR: 0.000030 +12/09/2025 01:20:46 - INFO - __main__ - Step: 8260 Loss: 10.2094 LR: 0.000030 +12/09/2025 01:21:26 - INFO - __main__ - Step: 8270 Loss: 10.2677 LR: 0.000030 +12/09/2025 01:22:06 - INFO - __main__ - Step: 8280 Loss: 10.2524 LR: 0.000030 +12/09/2025 01:22:47 - INFO - __main__ - Step: 8290 Loss: 10.1834 LR: 0.000030 +12/09/2025 01:23:28 - INFO - __main__ - Step: 8300 Loss: 10.2830 LR: 0.000030 +12/09/2025 01:24:09 - INFO - __main__ - Step: 8310 Loss: 10.2735 LR: 0.000030 +12/09/2025 01:24:49 - INFO - __main__ - Step: 8320 Loss: 10.2476 LR: 0.000030 +12/09/2025 01:25:31 - INFO - __main__ - Step: 8330 Loss: 10.2696 LR: 0.000030 +12/09/2025 01:26:10 - INFO - __main__ - Step: 8340 Loss: 10.2254 LR: 0.000030 +12/09/2025 01:26:51 - INFO - __main__ - Step: 8350 Loss: 9.8749 LR: 0.000030 +12/09/2025 01:27:31 - INFO - __main__ - Step: 8360 Loss: 10.2868 LR: 0.000030 +12/09/2025 01:28:12 - INFO - __main__ - Step: 8370 Loss: 10.3299 LR: 0.000030 +12/09/2025 01:28:53 - INFO - __main__ - Step: 8380 Loss: 10.1383 LR: 0.000030 +12/09/2025 01:29:32 - INFO - __main__ - Step: 8390 Loss: 10.1987 LR: 0.000030 +12/09/2025 01:30:12 - INFO - __main__ - Step: 8400 Loss: 10.3752 LR: 0.000030 +12/09/2025 01:30:52 - INFO - __main__ - Step: 8410 Loss: 10.0485 LR: 0.000030 +12/09/2025 01:31:33 - INFO - __main__ - Step: 8420 Loss: 10.1510 LR: 0.000030 +12/09/2025 01:32:14 - INFO - __main__ - Step: 8430 Loss: 10.0730 LR: 0.000030 +12/09/2025 01:32:55 - INFO - __main__ - Step: 8440 Loss: 10.1951 LR: 0.000030 +12/09/2025 01:33:35 - INFO - __main__ - Step: 8450 Loss: 10.2214 LR: 0.000030 +12/09/2025 01:34:16 - INFO - __main__ - Step: 8460 Loss: 10.2708 LR: 0.000030 +12/09/2025 01:34:57 - INFO - __main__ - Step: 8470 Loss: 9.8752 LR: 0.000030 +12/09/2025 01:35:37 - INFO - __main__ - Step: 8480 Loss: 10.2421 LR: 0.000030 +12/09/2025 01:36:17 - INFO - __main__ - Step: 8490 Loss: 10.2831 LR: 0.000030 +12/09/2025 01:36:57 - INFO - __main__ - Step: 8500 Loss: 10.2521 LR: 0.000030 +12/09/2025 01:36:57 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500 +12/09/2025 01:37:06 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/optimizer.bin +12/09/2025 01:37:06 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/scheduler.bin +12/09/2025 01:37:06 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/sampler.bin +12/09/2025 01:37:06 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500/random_states_0.pkl +12/09/2025 01:37:06 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-8500 +12/09/2025 01:37:06 - INFO - __main__ - Generating videos for validation... +12/09/2025 01:37:06 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.80it/s] +12/09/2025 01:37:12 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/09/2025 01:37:53 - INFO - __main__ - Step: 8510 Loss: 10.2791 LR: 0.000030 +12/09/2025 01:38:33 - INFO - __main__ - Step: 8520 Loss: 10.2445 LR: 0.000030 +12/09/2025 01:39:14 - INFO - __main__ - Step: 8530 Loss: 10.0240 LR: 0.000030 +12/09/2025 01:39:55 - INFO - __main__ - Step: 8540 Loss: 10.2756 LR: 0.000030 +12/09/2025 01:40:40 - INFO - __main__ - Step: 8550 Loss: 10.1570 LR: 0.000030 +12/09/2025 01:41:21 - INFO - __main__ - Step: 8560 Loss: 10.1915 LR: 0.000030 +12/09/2025 01:42:02 - INFO - __main__ - Step: 8570 Loss: 10.2468 LR: 0.000030 +12/09/2025 01:42:43 - INFO - __main__ - Step: 8580 Loss: 10.2841 LR: 0.000030 +12/09/2025 01:43:26 - INFO - __main__ - Step: 8590 Loss: 10.3508 LR: 0.000030 +12/09/2025 01:44:06 - INFO - __main__ - Step: 8600 Loss: 10.2955 LR: 0.000030 +12/09/2025 01:44:46 - INFO - __main__ - Step: 8610 Loss: 10.2341 LR: 0.000030 +12/09/2025 01:45:26 - INFO - __main__ - Step: 8620 Loss: 10.3108 LR: 0.000030 +12/09/2025 01:46:08 - INFO - __main__ - Step: 8630 Loss: 10.2283 LR: 0.000030 +12/09/2025 01:46:49 - INFO - __main__ - Step: 8640 Loss: 10.1501 LR: 0.000030 +12/09/2025 01:47:30 - INFO - __main__ - Step: 8650 Loss: 10.4081 LR: 0.000030 +12/09/2025 01:48:11 - INFO - __main__ - Step: 8660 Loss: 10.1406 LR: 0.000030 +12/09/2025 01:48:52 - INFO - __main__ - Step: 8670 Loss: 10.1809 LR: 0.000030 +12/09/2025 01:49:33 - INFO - __main__ - Step: 8680 Loss: 10.3772 LR: 0.000030 +12/09/2025 01:50:14 - INFO - __main__ - Step: 8690 Loss: 10.2832 LR: 0.000030 +12/09/2025 01:50:54 - INFO - __main__ - Step: 8700 Loss: 10.3794 LR: 0.000030 +12/09/2025 01:51:35 - INFO - __main__ - Step: 8710 Loss: 9.9623 LR: 0.000030 +12/09/2025 01:52:15 - INFO - __main__ - Step: 8720 Loss: 10.1601 LR: 0.000030 +12/09/2025 01:52:57 - INFO - __main__ - Step: 8730 Loss: 10.2284 LR: 0.000030 +12/09/2025 01:53:39 - INFO - __main__ - Step: 8740 Loss: 10.0540 LR: 0.000030 +12/09/2025 01:54:20 - INFO - __main__ - Step: 8750 Loss: 10.1500 LR: 0.000030 +12/09/2025 01:55:01 - INFO - __main__ - Step: 8760 Loss: 10.0504 LR: 0.000030 +12/09/2025 01:55:43 - INFO - __main__ - Step: 8770 Loss: 10.1850 LR: 0.000030 +12/09/2025 01:56:23 - INFO - __main__ - Step: 8780 Loss: 10.1295 LR: 0.000030 +12/09/2025 01:57:03 - INFO - __main__ - Step: 8790 Loss: 10.3019 LR: 0.000030 +12/09/2025 01:57:44 - INFO - __main__ - Step: 8800 Loss: 10.3225 LR: 0.000030 +12/09/2025 01:58:25 - INFO - __main__ - Step: 8810 Loss: 10.2366 LR: 0.000030 +12/09/2025 01:59:06 - INFO - __main__ - Step: 8820 Loss: 10.2086 LR: 0.000030 +12/09/2025 01:59:47 - INFO - __main__ - Step: 8830 Loss: 10.2784 LR: 0.000030 +12/09/2025 02:00:27 - INFO - __main__ - Step: 8840 Loss: 10.3409 LR: 0.000030 +12/09/2025 02:01:08 - INFO - __main__ - Step: 8850 Loss: 10.2942 LR: 0.000030 +12/09/2025 02:01:49 - INFO - __main__ - Step: 8860 Loss: 10.1809 LR: 0.000030 +12/09/2025 02:02:28 - INFO - __main__ - Step: 8870 Loss: 10.2588 LR: 0.000030 +12/09/2025 02:03:10 - INFO - __main__ - Step: 8880 Loss: 10.1412 LR: 0.000030 +12/09/2025 02:03:52 - INFO - __main__ - Step: 8890 Loss: 10.2051 LR: 0.000030 +12/09/2025 02:04:32 - INFO - __main__ - Step: 8900 Loss: 10.2463 LR: 0.000030 +12/09/2025 02:05:11 - INFO - __main__ - Step: 8910 Loss: 10.3263 LR: 0.000030 +12/09/2025 
02:05:52 - INFO - __main__ - Step: 8920 Loss: 10.2405 LR: 0.000030 +12/09/2025 02:06:32 - INFO - __main__ - Step: 8930 Loss: 10.4588 LR: 0.000030 +12/09/2025 02:07:11 - INFO - __main__ - Step: 8940 Loss: 10.0509 LR: 0.000030 +12/09/2025 02:07:51 - INFO - __main__ - Step: 8950 Loss: 10.1175 LR: 0.000030 +12/09/2025 02:08:30 - INFO - __main__ - Step: 8960 Loss: 10.1086 LR: 0.000030 +12/09/2025 02:09:10 - INFO - __main__ - Step: 8970 Loss: 10.2881 LR: 0.000030 +12/09/2025 02:09:49 - INFO - __main__ - Step: 8980 Loss: 9.8539 LR: 0.000030 +12/09/2025 02:10:31 - INFO - __main__ - Step: 8990 Loss: 10.3159 LR: 0.000030 +12/09/2025 02:11:11 - INFO - __main__ - Step: 9000 Loss: 10.3114 LR: 0.000030 +12/09/2025 02:11:11 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000 +12/09/2025 02:11:18 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/optimizer.bin +12/09/2025 02:11:18 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/scheduler.bin +12/09/2025 02:11:18 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/sampler.bin +12/09/2025 02:11:18 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000/random_states_0.pkl +12/09/2025 02:11:18 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9000 +12/09/2025 02:11:18 - INFO - __main__ - Generating videos for validation... +12/09/2025 02:11:18 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.79it/s] +12/09/2025 02:11:25 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/09/2025 02:12:05 - INFO - __main__ - Step: 9010 Loss: 10.3593 LR: 0.000030 +12/09/2025 02:12:46 - INFO - __main__ - Step: 9020 Loss: 10.3540 LR: 0.000030 +12/09/2025 02:13:30 - INFO - __main__ - Step: 9030 Loss: 10.0327 LR: 0.000030 +12/09/2025 02:14:11 - INFO - __main__ - Step: 9040 Loss: 10.3993 LR: 0.000030 +12/09/2025 02:14:52 - INFO - __main__ - Step: 9050 Loss: 10.1170 LR: 0.000030 +12/09/2025 02:15:33 - INFO - __main__ - Step: 9060 Loss: 10.2321 LR: 0.000030 +12/09/2025 02:16:14 - INFO - __main__ - Step: 9070 Loss: 10.2650 LR: 0.000030 +12/09/2025 02:16:55 - INFO - __main__ - Step: 9080 Loss: 10.3733 LR: 0.000030 +12/09/2025 02:17:36 - INFO - __main__ - Step: 9090 Loss: 10.3145 LR: 0.000030 +12/09/2025 02:18:16 - INFO - __main__ - Step: 9100 Loss: 10.1167 LR: 0.000030 +12/09/2025 02:18:57 - INFO - __main__ - Step: 9110 Loss: 10.1362 LR: 0.000030 +12/09/2025 02:19:38 - INFO - __main__ - Step: 9120 Loss: 10.2166 LR: 0.000030 +12/09/2025 02:20:18 - INFO - __main__ - Step: 9130 Loss: 10.3137 LR: 0.000030 +12/09/2025 02:20:59 - INFO - __main__ - Step: 9140 Loss: 10.3315 LR: 0.000030 +12/09/2025 02:21:40 - INFO - __main__ - Step: 9150 Loss: 10.3195 LR: 0.000030 +12/09/2025 02:22:21 - INFO - __main__ - Step: 9160 Loss: 10.2026 LR: 0.000030 +12/09/2025 02:23:00 - INFO - __main__ - Step: 9170 Loss: 10.3019 LR: 0.000030 +12/09/2025 02:23:42 - INFO - __main__ - Step: 9180 Loss: 10.2923 LR: 0.000030 +12/09/2025 02:24:23 - INFO - __main__ - Step: 9190 Loss: 10.3703 LR: 0.000030 
+12/09/2025 02:25:04 - INFO - __main__ - Step: 9200 Loss: 10.0775 LR: 0.000030 +12/09/2025 02:25:44 - INFO - __main__ - Step: 9210 Loss: 10.0284 LR: 0.000030 +12/09/2025 02:26:26 - INFO - __main__ - Step: 9220 Loss: 10.0956 LR: 0.000030 +12/09/2025 02:27:06 - INFO - __main__ - Step: 9230 Loss: 10.3597 LR: 0.000030 +12/09/2025 02:27:47 - INFO - __main__ - Step: 9240 Loss: 10.1692 LR: 0.000030 +12/09/2025 02:28:29 - INFO - __main__ - Step: 9250 Loss: 10.3566 LR: 0.000030 +12/09/2025 02:29:09 - INFO - __main__ - Step: 9260 Loss: 9.9540 LR: 0.000030 +12/09/2025 02:29:50 - INFO - __main__ - Step: 9270 Loss: 10.1716 LR: 0.000030 +12/09/2025 02:30:30 - INFO - __main__ - Step: 9280 Loss: 10.0799 LR: 0.000030 +12/09/2025 02:31:11 - INFO - __main__ - Step: 9290 Loss: 10.2651 LR: 0.000030 +12/09/2025 02:31:52 - INFO - __main__ - Step: 9300 Loss: 10.2888 LR: 0.000030 +12/09/2025 02:32:33 - INFO - __main__ - Step: 9310 Loss: 10.3632 LR: 0.000030 +12/09/2025 02:33:13 - INFO - __main__ - Step: 9320 Loss: 10.1456 LR: 0.000030 +12/09/2025 02:33:54 - INFO - __main__ - Step: 9330 Loss: 10.0606 LR: 0.000030 +12/09/2025 02:34:34 - INFO - __main__ - Step: 9340 Loss: 10.1240 LR: 0.000030 +12/09/2025 02:35:15 - INFO - __main__ - Step: 9350 Loss: 10.3006 LR: 0.000030 +12/09/2025 02:35:57 - INFO - __main__ - Step: 9360 Loss: 10.3873 LR: 0.000030 +12/09/2025 02:36:38 - INFO - __main__ - Step: 9370 Loss: 10.2983 LR: 0.000030 +12/09/2025 02:37:18 - INFO - __main__ - Step: 9380 Loss: 10.3257 LR: 0.000030 +12/09/2025 02:37:58 - INFO - __main__ - Step: 9390 Loss: 10.2567 LR: 0.000030 +12/09/2025 02:38:39 - INFO - __main__ - Step: 9400 Loss: 10.3199 LR: 0.000030 +12/09/2025 02:39:18 - INFO - __main__ - Step: 9410 Loss: 9.8679 LR: 0.000030 +12/09/2025 02:39:59 - INFO - __main__ - Step: 9420 Loss: 10.3053 LR: 0.000030 +12/09/2025 02:40:41 - INFO - __main__ - Step: 9430 Loss: 10.1604 LR: 0.000030 +12/09/2025 02:41:23 - INFO - __main__ - Step: 9440 Loss: 10.2145 LR: 0.000030 +12/09/2025 02:42:03 - INFO - __main__ - Step: 9450 Loss: 10.3038 LR: 0.000030 +12/09/2025 02:42:44 - INFO - __main__ - Step: 9460 Loss: 10.2254 LR: 0.000030 +12/09/2025 02:43:25 - INFO - __main__ - Step: 9470 Loss: 10.4178 LR: 0.000030 +12/09/2025 02:44:06 - INFO - __main__ - Step: 9480 Loss: 10.1861 LR: 0.000030 +12/09/2025 02:44:47 - INFO - __main__ - Step: 9490 Loss: 10.3053 LR: 0.000030 +12/09/2025 02:45:27 - INFO - __main__ - Step: 9500 Loss: 10.0535 LR: 0.000030 +12/09/2025 02:45:27 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500 +12/09/2025 02:45:37 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/optimizer.bin +12/09/2025 02:45:37 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/scheduler.bin +12/09/2025 02:45:37 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/sampler.bin +12/09/2025 02:45:37 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500/random_states_0.pkl +12/09/2025 02:45:37 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-9500 +12/09/2025 02:45:37 - INFO - __main__ - Generating videos for validation... +12/09/2025 02:45:37 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.69it/s] +12/09/2025 02:45:44 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/09/2025 02:46:25 - INFO - __main__ - Step: 9510 Loss: 10.1563 LR: 0.000030 +12/09/2025 02:47:07 - INFO - __main__ - Step: 9520 Loss: 10.0036 LR: 0.000030 +12/09/2025 02:47:50 - INFO - __main__ - Step: 9530 Loss: 10.0589 LR: 0.000030 +12/09/2025 02:48:32 - INFO - __main__ - Step: 9540 Loss: 10.3363 LR: 0.000030 +12/09/2025 02:49:14 - INFO - __main__ - Step: 9550 Loss: 10.3414 LR: 0.000030 +12/09/2025 02:49:55 - INFO - __main__ - Step: 9560 Loss: 10.2372 LR: 0.000030 +12/09/2025 02:50:37 - INFO - __main__ - Step: 9570 Loss: 10.1867 LR: 0.000030 +12/09/2025 02:51:18 - INFO - __main__ - Step: 9580 Loss: 10.2701 LR: 0.000030 +12/09/2025 02:52:01 - INFO - __main__ - Step: 9590 Loss: 9.9317 LR: 0.000030 +12/09/2025 02:52:43 - INFO - __main__ - Step: 9600 Loss: 10.1395 LR: 0.000030 +12/09/2025 02:53:26 - INFO - __main__ - Step: 9610 Loss: 10.1316 LR: 0.000030 +12/09/2025 02:54:08 - INFO - __main__ - Step: 9620 Loss: 10.2242 LR: 0.000030 +12/09/2025 02:54:51 - INFO - __main__ - Step: 9630 Loss: 10.2056 LR: 0.000030 +12/09/2025 02:55:31 - INFO - __main__ - Step: 9640 Loss: 9.9158 LR: 0.000030 +12/09/2025 02:56:12 - INFO - __main__ - Step: 9650 Loss: 10.2432 LR: 0.000030 +12/09/2025 02:56:53 - INFO - __main__ - Step: 9660 Loss: 10.3903 LR: 0.000030 +12/09/2025 02:57:35 - INFO - __main__ - Step: 9670 Loss: 10.2608 LR: 0.000030 +12/09/2025 02:58:16 - INFO - __main__ - Step: 9680 Loss: 10.2800 LR: 0.000030 +12/09/2025 02:58:57 - INFO - __main__ - Step: 9690 Loss: 10.1646 LR: 0.000030 +12/09/2025 02:59:40 - INFO - __main__ - Step: 9700 Loss: 10.3005 LR: 0.000030 +12/09/2025 03:00:20 - INFO - __main__ - Step: 9710 Loss: 10.3200 LR: 0.000030 +12/09/2025 03:01:03 - INFO - __main__ - Step: 9720 Loss: 10.1391 LR: 0.000030 +12/09/2025 03:01:46 - INFO - __main__ - Step: 9730 Loss: 10.0638 LR: 0.000030 +12/09/2025 03:02:27 - INFO - __main__ - Step: 9740 Loss: 10.3703 LR: 0.000030 +12/09/2025 03:03:08 - INFO - __main__ - Step: 9750 Loss: 10.0388 LR: 0.000030 +12/09/2025 03:03:50 - INFO - __main__ - Step: 9760 Loss: 10.2022 LR: 0.000030 +12/09/2025 03:04:33 - INFO - __main__ - Step: 9770 Loss: 10.2557 LR: 0.000030 +12/09/2025 03:05:15 - INFO - __main__ - Step: 9780 Loss: 10.3068 LR: 0.000030 +12/09/2025 03:05:55 - INFO - __main__ - Step: 9790 Loss: 10.1136 LR: 0.000030 +12/09/2025 03:06:36 - INFO - __main__ - Step: 9800 Loss: 10.0547 LR: 0.000030 +12/09/2025 03:07:18 - INFO - __main__ - Step: 9810 Loss: 10.1827 LR: 0.000030 +12/09/2025 03:08:01 - INFO - __main__ - Step: 9820 Loss: 10.2698 LR: 0.000030 +12/09/2025 03:08:42 - INFO - __main__ - Step: 9830 Loss: 10.3142 LR: 0.000030 +12/09/2025 03:09:25 - INFO - __main__ - Step: 9840 Loss: 10.1912 LR: 0.000030 +12/09/2025 03:10:07 - INFO - __main__ - Step: 9850 Loss: 10.1412 LR: 0.000030 +12/09/2025 03:10:48 - INFO - __main__ - Step: 9860 Loss: 10.3545 LR: 0.000030 +12/09/2025 03:11:30 - INFO - __main__ - Step: 9870 Loss: 10.2420 LR: 0.000030 +12/09/2025 03:12:12 - INFO - __main__ - Step: 9880 Loss: 10.0121 LR: 0.000030 +12/09/2025 03:12:53 - INFO - __main__ - Step: 9890 Loss: 10.1630 LR: 0.000030 +12/09/2025 03:13:35 - INFO - __main__ - Step: 9900 Loss: 10.2106 LR: 0.000030 +12/09/2025 03:14:17 - INFO - __main__ - Step: 9910 Loss: 9.8999 LR: 0.000030 +12/09/2025 
03:14:58 - INFO - __main__ - Step: 9920 Loss: 10.2557 LR: 0.000030 +12/09/2025 03:15:42 - INFO - __main__ - Step: 9930 Loss: 9.9058 LR: 0.000030 +12/09/2025 03:16:22 - INFO - __main__ - Step: 9940 Loss: 10.3045 LR: 0.000030 +12/09/2025 03:17:04 - INFO - __main__ - Step: 9950 Loss: 10.1141 LR: 0.000030 +12/09/2025 03:17:46 - INFO - __main__ - Step: 9960 Loss: 9.8588 LR: 0.000030 +12/09/2025 03:18:26 - INFO - __main__ - Step: 9970 Loss: 10.1512 LR: 0.000030 +12/09/2025 03:19:07 - INFO - __main__ - Step: 9980 Loss: 10.2319 LR: 0.000030 +12/09/2025 03:19:48 - INFO - __main__ - Step: 9990 Loss: 10.2942 LR: 0.000030 +12/09/2025 03:20:29 - INFO - __main__ - Step: 10000 Loss: 10.2682 LR: 0.000030 +12/09/2025 03:20:29 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000 +12/09/2025 03:20:36 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/optimizer.bin +12/09/2025 03:20:36 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/scheduler.bin +12/09/2025 03:20:36 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/sampler.bin +12/09/2025 03:20:36 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_0.pkl +12/09/2025 03:20:36 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000 +12/09/2025 03:20:36 - INFO - __main__ - Generating videos for validation... +12/09/2025 03:20:36 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.87it/s] +12/09/2025 03:20:43 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio +12/09/2025 03:20:43 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000 +12/09/2025 03:21:33 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/optimizer.bin +12/09/2025 03:21:33 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/scheduler.bin +12/09/2025 03:21:33 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/sampler.bin +12/09/2025 03:21:33 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000/random_states_0.pkl +12/09/2025 03:21:33 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio/checkpoint-10000 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/requirements.txt b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 
+scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/wandb-metadata.json b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3fc9277d595e50f2605af5f4a0975cbb8a2651 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-08T15:59:43.669474Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.1", + "--num_frames", + "4", + "--video_height", + "256", 
+ "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "11900305113088" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "f0bt4siuu2gjzmv16hc4nme2mcfrsfsi" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/wandb-summary.json b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3b953ddbecc4ea9ace17ddb013fd672872480265 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"_timestamp":1.76525044327647e+09,"generated_videos_first_frame":{"filenames":["media/images/generated_videos_first_frame_10000_8328d2d0556a95ff2759.png","media/images/generated_videos_first_frame_10000_980ee3261a5cf9cce942.png"],"captions":["a cat playing","a girl walking"],"_type":"images/separated","width":448,"height":256,"format":"png","count":2},"_wandb":{"runtime":40912},"_runtime":40912.716925498,"_step":10000,"step_loss":10.268203735351562,"lr":2.9999999999999997e-05,"avg_masking_rate":0.7040722370147705} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug-core.log b/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..6841dc7c46b112e0e43c985c9bbd45c460faaa0d --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug-core.log @@ -0,0 +1,16 @@ +{"time":"2025-12-08T15:59:43.744475852Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpmaf9gegn/port-3462940.txt","pid":3462940,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-08T15:59:43.744995335Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":3462940} +{"time":"2025-12-08T15:59:43.744970513Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3462940-3463253-4089553982/socket","Net":"unix"}} +{"time":"2025-12-08T15:59:43.929408895Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-08T15:59:43.935485908Z","level":"INFO","msg":"handleInformInit: received","streamId":"j5rc8ish","id":"1(@)"} +{"time":"2025-12-08T15:59:44.101834101Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"j5rc8ish","id":"1(@)"} +{"time":"2025-12-09T03:21:37.737213016Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"j5rc8ish","id":"1(@)"} +{"time":"2025-12-09T03:21:37.739815436Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"j5rc8ish","id":"1(@)"} +{"time":"2025-12-09T03:22:18.88051873Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T03:22:18.880564702Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T03:22:18.88057283Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T03:22:18.880597849Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T03:22:18.880709214Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T03:22:18.880741969Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T03:22:18.880796739Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-3462940-3463253-4089553982/socket","Net":"unix"}} +{"time":"2025-12-09T03:22:18.880837027Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug-internal.log b/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..95a069e458685459f654cd342fc0f563e3763ff1 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug-internal.log @@ -0,0 +1,12 @@ 
+{"time":"2025-12-08T15:59:43.935580663Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-08T15:59:44.101633338Z","level":"INFO","msg":"stream: created new stream","id":"j5rc8ish"} +{"time":"2025-12-08T15:59:44.101738834Z","level":"INFO","msg":"handler: started","stream_id":"j5rc8ish"} +{"time":"2025-12-08T15:59:44.101827368Z","level":"INFO","msg":"stream: started","id":"j5rc8ish"} +{"time":"2025-12-08T15:59:44.101850473Z","level":"INFO","msg":"sender: started","stream_id":"j5rc8ish"} +{"time":"2025-12-08T15:59:44.101851999Z","level":"INFO","msg":"writer: started","stream_id":"j5rc8ish"} +{"time":"2025-12-09T03:21:37.602552256Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T03:21:37.733763019Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-12-09T03:21:37.737771064Z","level":"INFO","msg":"stream: closing","id":"j5rc8ish"} +{"time":"2025-12-09T03:21:37.737788193Z","level":"INFO","msg":"handler: closed","stream_id":"j5rc8ish"} +{"time":"2025-12-09T03:21:37.737842593Z","level":"INFO","msg":"sender: closed","stream_id":"j5rc8ish"} +{"time":"2025-12-09T03:21:37.73786844Z","level":"INFO","msg":"stream: closed","id":"j5rc8ish"} diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug.log b/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7a4809ac23c1f44e627d3d3eebd7fc37d752fe2b --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug.log @@ -0,0 +1,27 @@ +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_setup.py:_flush():80] Configure stats pid to 3462940 +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug.log +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251208_155943-j5rc8ish/logs/debug-internal.log +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_init.py:init():841] calling init triggers +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-08 15:59:43,673 INFO MainThread:3462940 [wandb_init.py:init():889] starting backend +2025-12-08 15:59:43,929 INFO MainThread:3462940 [wandb_init.py:init():892] sending inform_init request +2025-12-08 15:59:43,934 INFO MainThread:3462940 [wandb_init.py:init():900] backend started and connected +2025-12-08 15:59:43,935 INFO MainThread:3462940 [wandb_init.py:init():970] updated telemetry +2025-12-08 15:59:43,940 INFO MainThread:3462940 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-08 15:59:44,509 INFO MainThread:3462940 [wandb_init.py:init():1041] starting run threads in backend +2025-12-08 15:59:44,707 INFO MainThread:3462940 [wandb_run.py:_console_start():2521] atexit reg 
+2025-12-08 15:59:44,707 INFO MainThread:3462940 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-08 15:59:44,707 INFO MainThread:3462940 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-08 15:59:44,707 INFO MainThread:3462940 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-08 15:59:44,710 INFO MainThread:3462940 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-08 15:59:44,711 INFO MainThread:3462940 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_10_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.1} +2025-12-09 03:21:37,223 INFO MainThread:3462940 [wandb_run.py:_finish():2287] finishing run jin-bin/meissonic/j5rc8ish +2025-12-09 03:21:37,225 INFO MainThread:3462940 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0 +2025-12-09 03:21:37,225 INFO MainThread:3462940 [wandb_run.py:_restore():2468] restore +2025-12-09 03:21:37,225 INFO MainThread:3462940 [wandb_run.py:_restore():2474] restore done +2025-12-09 03:21:37,736 INFO MainThread:3462940 [wandb_run.py:_footer_sync_info():3862] logging synced files diff --git a/Meissonic/wandb/run-20251208_155943-j5rc8ish/run-j5rc8ish.wandb b/Meissonic/wandb/run-20251208_155943-j5rc8ish/run-j5rc8ish.wandb new file mode 100644 index 0000000000000000000000000000000000000000..1cf5c5650e0e08e83074e1c99d544c76de08d3a8 --- /dev/null +++ b/Meissonic/wandb/run-20251208_155943-j5rc8ish/run-j5rc8ish.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488845c8db48654841acaecdc70247dd695f96008f786b03f3db6824b3222477 +size 10711439 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_4ea9441b252682155006.png 
b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_4ea9441b252682155006.png new file mode 100644 index 0000000000000000000000000000000000000000..e3686752e68ea4903b416f823d7715d05e0722b4 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_4ea9441b252682155006.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea9441b2526821550065be4282383a91f18e8f289e6f64c5286893c87f4e5e5 +size 235465 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_be5afcc9b61ce7cc9765.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_be5afcc9b61ce7cc9765.png new file mode 100644 index 0000000000000000000000000000000000000000..5dadeb5a0b3e104d148a2027d8c36f8c835acb99 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_be5afcc9b61ce7cc9765.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be5afcc9b61ce7cc976552a10997db35e39d3777b8ba46832f7b730c1fab3104 +size 234944 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_7c59a605f746fefa06f3.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_7c59a605f746fefa06f3.png new file mode 100644 index 0000000000000000000000000000000000000000..aa69874ba98b31a5706dbdae389a2ff9f07d8bf4 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_7c59a605f746fefa06f3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c59a605f746fefa06f3c8c0981023f314a8343302ce0498fbb4aebb85b5ea48 +size 234597 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_e846322d8d1fe1da0c06.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_e846322d8d1fe1da0c06.png new file mode 100644 index 0000000000000000000000000000000000000000..6c57f92a48a45124c8047e429dabfc630e682135 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_e846322d8d1fe1da0c06.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e846322d8d1fe1da0c06af5a8f74c3afcadf53c258aa36a35d4dc90e6722dbc3 +size 229025 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_e7251adb287026b97ff8.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_e7251adb287026b97ff8.png new file mode 100644 index 0000000000000000000000000000000000000000..288b5357bfd6f44c47c059f1b852223a08d2eea7 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_e7251adb287026b97ff8.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7251adb287026b97ff807bb6fa787d23e29837fb1ab75855513e00dc2679e7d +size 232454 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_fb353e591b1b0dbac386.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_fb353e591b1b0dbac386.png new file mode 100644 index 0000000000000000000000000000000000000000..eab416e77eed30c536341ec9692595b8f1b922ac 
--- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_fb353e591b1b0dbac386.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb353e591b1b0dbac38635e86c24871936ac696f4ff7674771a6ef15a513e45e +size 229162 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_4254c55c5a44dae8222b.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_4254c55c5a44dae8222b.png new file mode 100644 index 0000000000000000000000000000000000000000..257d8ee8ee776ff5b4131d710fcfc0c5ca82a250 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_4254c55c5a44dae8222b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4254c55c5a44dae8222b2f94225010db931e57096a09925bac1e768f58993672 +size 236913 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_880fb5b7bb7d55a41102.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_880fb5b7bb7d55a41102.png new file mode 100644 index 0000000000000000000000000000000000000000..db3f8e2868d11f353f5051aeb20b5e5c095b4dd8 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_880fb5b7bb7d55a41102.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:880fb5b7bb7d55a411020da08650dc25fe858c588e36c6173cc832251ec76713 +size 234622 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_0af47ac2b0fd0a7b83b9.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_0af47ac2b0fd0a7b83b9.png new file mode 100644 index 0000000000000000000000000000000000000000..a66371578a8fe0146839e4faac46604f2d924376 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_0af47ac2b0fd0a7b83b9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0af47ac2b0fd0a7b83b953ffea43750361aa0fa8df8302b1b8e87b311f86db99 +size 235174 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_38859ead3b87553090be.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_38859ead3b87553090be.png new file mode 100644 index 0000000000000000000000000000000000000000..14629585c7a65a8365edf4a781eb7e55bf217720 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_38859ead3b87553090be.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38859ead3b87553090be10142facdc7a4f8780e6d9d0b17eb4abf0bc124b0746 +size 229801 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_1b3f708ccf2664b9bd84.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_1b3f708ccf2664b9bd84.png new file mode 100644 index 0000000000000000000000000000000000000000..eb64442cbc0de5576a61c8404e2bce41579d5f5e --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_1b3f708ccf2664b9bd84.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1b3f708ccf2664b9bd84c4142e94f63ed1f1d9fba426b1835eb30e04aad78ab3 +size 240388 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_96fc2c23d9374b5c001f.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_96fc2c23d9374b5c001f.png new file mode 100644 index 0000000000000000000000000000000000000000..768e69413dff2086b1265e4fd28b894fa044ed6a --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_96fc2c23d9374b5c001f.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96fc2c23d9374b5c001fbbcc7e3cf2abc977c4fd89bff8872af08c61ab92cc00 +size 236161 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_7f60fcf85257e0427cb4.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_7f60fcf85257e0427cb4.png new file mode 100644 index 0000000000000000000000000000000000000000..933734f026856731f07931a02829589ce131bd38 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_7f60fcf85257e0427cb4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f60fcf85257e0427cb40bd8076e1243f9e42adfe61075674d62b0d5841bdc6a +size 231968 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_f36bf77eea280b84a34e.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_f36bf77eea280b84a34e.png new file mode 100644 index 0000000000000000000000000000000000000000..38098b236aed7aa104076c52235d007303a98341 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_f36bf77eea280b84a34e.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36bf77eea280b84a34ed04791d44d542d3b4425864c9020408af51961bb2ba2 +size 227665 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_92b8064a4e25f8ad3702.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_92b8064a4e25f8ad3702.png new file mode 100644 index 0000000000000000000000000000000000000000..9783534a5783b293f8e0b68a1fe9172cb02ecc76 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_92b8064a4e25f8ad3702.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b8064a4e25f8ad370281aa9f5364c7b09de3cd29ae7ba63ca51dad6b402323 +size 227242 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_f6969510d28d905ce414.png b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_f6969510d28d905ce414.png new file mode 100644 index 0000000000000000000000000000000000000000..f4c5548f611f4b0aee0776407d41af5a021ad14d --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_f6969510d28d905ce414.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6969510d28d905ce414921aa949d88990ca6b2d81ab742cda228c22accb0028 +size 209920 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/output.log b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/output.log new file mode 
100644 index 0000000000000000000000000000000000000000..4549e12252fbc0b4acbd0e69ff8f35f4fd7b51eb --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/output.log @@ -0,0 +1,542 @@ +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 67.35it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 06:08:59 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6234.90it/s] +12/09/2025 06:09:01 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=32, W'=56 +12/09/2025 06:09:01 - INFO - __main__ - Theoretical dimensions: F'=1, H'=32, W'=56 +12/09/2025 06:09:01 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 06:09:03 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 06:09:19 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 06:09:19 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 06:09:38 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 06:09:41 - INFO - __main__ - Wan backbone parameters are frozen (requires_grad=False) +12/09/2025 06:09:41 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 06:09:41 - INFO - __main__ - Wan backbone is frozen (lr=0) +12/09/2025 06:09:41 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 06:09:41 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 06:09:50 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 06:09:50 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 06:09:50 - INFO - __main__ - Dataloader configuration: +12/09/2025 06:09:50 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 06:09:50 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 06:09:50 - INFO - __main__ - - persistent_workers: True +12/09/2025 06:09:50 - INFO - __main__ - - pin_memory: True +12/09/2025 06:09:50 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 06:11:12 - INFO - __main__ - ***** Running training ***** +12/09/2025 06:11:12 - INFO - __main__ - Num training steps = 10000 +12/09/2025 06:11:12 - INFO - __main__ - Instantaneous batch size per device = 2 +12/09/2025 06:11:12 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/09/2025 06:11:12 - INFO - __main__ - Gradient Accumulation steps = 4 +12/09/2025 06:12:04 - INFO - __main__ - Step: 10 Loss: 11.1668 LR: 0.000000 +12/09/2025 06:12:43 - INFO - __main__ - Step: 20 Loss: 11.2193 LR: 0.000000 +12/09/2025 06:13:15 - INFO - __main__ - Step: 30 Loss: 11.2492 LR: 0.000000 +12/09/2025 06:13:53 - INFO - __main__ - Step: 40 Loss: 11.1764 LR: 0.000000 +12/09/2025 06:14:30 - INFO - __main__ - Step: 50 Loss: 11.2046 LR: 0.000000 +12/09/2025 06:15:02 - INFO - __main__ - Step: 60 Loss: 11.1739 LR: 0.000000 +12/09/2025 06:15:38 - INFO - __main__ - Step: 70 Loss: 11.1334 LR: 0.000000 +12/09/2025 06:16:15 - INFO - __main__ - Step: 80 Loss: 11.1807 LR: 0.000000 +12/09/2025 06:16:52 - INFO - __main__ - Step: 90 Loss: 11.1287 LR: 0.000000 +12/09/2025 06:17:27 - INFO - __main__ - Step: 100 Loss: 11.1015 LR: 0.000000 +12/09/2025 06:18:01 - INFO - __main__ - Step: 110 Loss: 11.1268 LR: 0.000000 +12/09/2025 06:18:35 - INFO - __main__ - Step: 120 Loss: 11.0750 LR: 0.000000 +12/09/2025 06:19:10 - INFO - __main__ - Step: 130 Loss: 11.0581 LR: 0.000000 +12/09/2025 06:19:43 - INFO - __main__ - Step: 140 Loss: 11.0381 LR: 0.000000 +12/09/2025 06:20:17 - INFO - __main__ - Step: 150 Loss: 11.0657 LR: 0.000000 +12/09/2025 06:20:51 - INFO - __main__ - Step: 160 Loss: 11.0320 LR: 0.000000 +12/09/2025 06:21:24 - INFO - __main__ - Step: 170 Loss: 11.0644 LR: 0.000000 +12/09/2025 06:21:57 - INFO - __main__ - Step: 180 Loss: 11.0552 LR: 0.000000 +12/09/2025 06:22:32 - INFO - __main__ - Step: 190 Loss: 10.9938 LR: 0.000000 +12/09/2025 06:23:09 - INFO - __main__ - Step: 200 Loss: 10.9979 LR: 0.000000 +12/09/2025 06:23:46 - INFO - __main__ - Step: 210 Loss: 10.9963 LR: 0.000000 +12/09/2025 06:24:21 - INFO - __main__ - Step: 220 Loss: 10.9833 LR: 0.000000 +12/09/2025 06:24:55 - INFO - __main__ - Step: 230 Loss: 10.9983 LR: 0.000000 +12/09/2025 06:25:29 - INFO - __main__ - Step: 240 Loss: 11.0060 LR: 0.000000 +12/09/2025 06:26:08 - INFO - __main__ - Step: 250 Loss: 10.9848 LR: 0.000000 +12/09/2025 06:26:42 - INFO - __main__ - Step: 260 Loss: 10.9357 LR: 0.000000 +12/09/2025 06:27:22 - INFO - __main__ - Step: 270 Loss: 10.9227 LR: 0.000000 +12/09/2025 06:27:58 - INFO - __main__ - Step: 280 Loss: 10.9936 LR: 0.000000 +12/09/2025 06:28:37 - INFO - __main__ - Step: 290 Loss: 10.9555 LR: 0.000000 +12/09/2025 06:29:13 - INFO - __main__ - Step: 300 Loss: 10.9174 LR: 0.000000 +12/09/2025 06:29:49 - INFO - __main__ - Step: 310 Loss: 10.9509 LR: 0.000000 +12/09/2025 06:30:25 - INFO - __main__ - Step: 320 Loss: 10.9304 LR: 0.000000 +12/09/2025 06:31:01 - INFO - __main__ - Step: 330 Loss: 10.9065 LR: 0.000000 +12/09/2025 06:31:37 - INFO - __main__ - Step: 340 Loss: 10.9084 LR: 0.000000 +12/09/2025 06:32:12 - INFO - __main__ - Step: 350 Loss: 11.0131 LR: 0.000000 +12/09/2025 06:32:48 - INFO - __main__ - Step: 360 Loss: 10.8504 LR: 0.000000 +12/09/2025 06:33:25 - INFO - __main__ - Step: 370 Loss: 10.8779 LR: 0.000000 +12/09/2025 06:34:01 - INFO - __main__ - Step: 380 Loss: 10.8450 LR: 0.000000 +12/09/2025 06:34:37 - INFO - __main__ - Step: 390 Loss: 10.8490 LR: 0.000000 +12/09/2025 06:35:14 - INFO - __main__ - Step: 400 Loss: 10.9187 LR: 0.000000 +12/09/2025 06:35:50 - INFO - __main__ - Step: 410 Loss: 10.8653 LR: 0.000000 +12/09/2025 06:36:26 - INFO - __main__ - Step: 420 Loss: 10.8252 LR: 0.000000 +12/09/2025 06:37:01 - INFO - __main__ - Step: 430 Loss: 10.8188 LR: 0.000000 +12/09/2025 06:37:37 - INFO - __main__ - Step: 440 Loss: 10.8241 LR: 0.000000 +12/09/2025 
06:38:13 - INFO - __main__ - Step: 450 Loss: 10.8091 LR: 0.000000 +12/09/2025 06:38:48 - INFO - __main__ - Step: 460 Loss: 10.8458 LR: 0.000000 +12/09/2025 06:39:22 - INFO - __main__ - Step: 470 Loss: 10.8439 LR: 0.000000 +12/09/2025 06:39:57 - INFO - __main__ - Step: 480 Loss: 10.8652 LR: 0.000000 +12/09/2025 06:40:32 - INFO - __main__ - Step: 490 Loss: 10.9514 LR: 0.000000 +12/09/2025 06:41:07 - INFO - __main__ - Step: 500 Loss: 10.8374 LR: 0.000000 +12/09/2025 06:41:07 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500 +12/09/2025 06:41:13 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/optimizer.bin +12/09/2025 06:41:13 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/scheduler.bin +12/09/2025 06:41:13 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/sampler.bin +12/09/2025 06:41:13 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500/random_states_0.pkl +12/09/2025 06:41:13 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-500 +12/09/2025 06:41:13 - INFO - __main__ - Generating videos for validation... +12/09/2025 06:41:13 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.88it/s] +12/09/2025 06:41:20 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio +12/09/2025 06:41:56 - INFO - __main__ - Step: 510 Loss: 10.7682 LR: 0.000000 +12/09/2025 06:42:31 - INFO - __main__ - Step: 520 Loss: 10.8190 LR: 0.000000 +12/09/2025 06:43:07 - INFO - __main__ - Step: 530 Loss: 10.7900 LR: 0.000000 +12/09/2025 06:43:43 - INFO - __main__ - Step: 540 Loss: 10.7587 LR: 0.000000 +12/09/2025 06:44:19 - INFO - __main__ - Step: 550 Loss: 10.8122 LR: 0.000000 +12/09/2025 06:44:54 - INFO - __main__ - Step: 560 Loss: 10.7749 LR: 0.000000 +12/09/2025 06:45:30 - INFO - __main__ - Step: 570 Loss: 10.9821 LR: 0.000000 +12/09/2025 06:46:05 - INFO - __main__ - Step: 580 Loss: 10.7714 LR: 0.000000 +12/09/2025 06:46:42 - INFO - __main__ - Step: 590 Loss: 10.8758 LR: 0.000000 +12/09/2025 06:47:18 - INFO - __main__ - Step: 600 Loss: 10.8953 LR: 0.000000 +12/09/2025 06:47:53 - INFO - __main__ - Step: 610 Loss: 10.7866 LR: 0.000000 +12/09/2025 06:48:30 - INFO - __main__ - Step: 620 Loss: 10.7730 LR: 0.000000 +12/09/2025 06:49:06 - INFO - __main__ - Step: 630 Loss: 10.9168 LR: 0.000000 +12/09/2025 06:49:42 - INFO - __main__ - Step: 640 Loss: 10.7946 LR: 0.000000 +12/09/2025 06:50:18 - INFO - __main__ - Step: 650 Loss: 10.8732 LR: 0.000000 +12/09/2025 06:50:53 - INFO - __main__ - Step: 660 Loss: 10.8493 LR: 0.000000 +12/09/2025 06:51:29 - INFO - __main__ - Step: 670 Loss: 10.7979 LR: 0.000000 +12/09/2025 06:52:05 - INFO - __main__ - Step: 680 Loss: 10.7677 LR: 0.000000 +12/09/2025 06:52:41 - INFO - __main__ - Step: 690 Loss: 10.8272 LR: 0.000000 +12/09/2025 06:53:15 - INFO - __main__ - Step: 700 Loss: 10.7772 LR: 0.000000 +12/09/2025 06:53:50 - INFO - __main__ - Step: 710 Loss: 10.8874 LR: 0.000000 +12/09/2025 06:54:25 - INFO - __main__ - Step: 720 Loss: 10.7958 LR: 0.000000 +12/09/2025 06:54:59 - INFO - 
__main__ - Step: 730 Loss: 10.7957 LR: 0.000000 +12/09/2025 06:55:34 - INFO - __main__ - Step: 740 Loss: 10.7506 LR: 0.000000 +12/09/2025 06:56:09 - INFO - __main__ - Step: 750 Loss: 10.7045 LR: 0.000000 +12/09/2025 06:56:44 - INFO - __main__ - Step: 760 Loss: 10.7127 LR: 0.000000 +12/09/2025 06:57:19 - INFO - __main__ - Step: 770 Loss: 10.7613 LR: 0.000000 +12/09/2025 06:57:55 - INFO - __main__ - Step: 780 Loss: 10.7651 LR: 0.000000 +12/09/2025 06:58:30 - INFO - __main__ - Step: 790 Loss: 10.7413 LR: 0.000000 +12/09/2025 06:59:05 - INFO - __main__ - Step: 800 Loss: 10.6675 LR: 0.000000 +12/09/2025 06:59:39 - INFO - __main__ - Step: 810 Loss: 10.7217 LR: 0.000000 +12/09/2025 07:00:13 - INFO - __main__ - Step: 820 Loss: 10.9675 LR: 0.000000 +12/09/2025 07:00:48 - INFO - __main__ - Step: 830 Loss: 10.8523 LR: 0.000000 +12/09/2025 07:01:23 - INFO - __main__ - Step: 840 Loss: 10.8386 LR: 0.000000 +12/09/2025 07:01:57 - INFO - __main__ - Step: 850 Loss: 10.9333 LR: 0.000000 +12/09/2025 07:02:32 - INFO - __main__ - Step: 860 Loss: 10.8703 LR: 0.000000 +12/09/2025 07:03:08 - INFO - __main__ - Step: 870 Loss: 10.7036 LR: 0.000000 +12/09/2025 07:03:43 - INFO - __main__ - Step: 880 Loss: 10.7645 LR: 0.000000 +12/09/2025 07:04:17 - INFO - __main__ - Step: 890 Loss: 10.8497 LR: 0.000000 +12/09/2025 07:04:51 - INFO - __main__ - Step: 900 Loss: 10.7641 LR: 0.000000 +12/09/2025 07:05:26 - INFO - __main__ - Step: 910 Loss: 10.7548 LR: 0.000000 +12/09/2025 07:06:01 - INFO - __main__ - Step: 920 Loss: 10.6804 LR: 0.000000 +12/09/2025 07:06:37 - INFO - __main__ - Step: 930 Loss: 10.7162 LR: 0.000000 +12/09/2025 07:07:12 - INFO - __main__ - Step: 940 Loss: 10.8392 LR: 0.000000 +12/09/2025 07:07:48 - INFO - __main__ - Step: 950 Loss: 10.8436 LR: 0.000000 +12/09/2025 07:08:22 - INFO - __main__ - Step: 960 Loss: 10.6719 LR: 0.000000 +12/09/2025 07:08:57 - INFO - __main__ - Step: 970 Loss: 10.7326 LR: 0.000000 +12/09/2025 07:09:32 - INFO - __main__ - Step: 980 Loss: 10.7283 LR: 0.000000 +12/09/2025 07:10:07 - INFO - __main__ - Step: 990 Loss: 10.7694 LR: 0.000000 +12/09/2025 07:10:41 - INFO - __main__ - Step: 1000 Loss: 10.7278 LR: 0.000000 +12/09/2025 07:10:41 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000 +12/09/2025 07:10:46 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/optimizer.bin +12/09/2025 07:10:46 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/scheduler.bin +12/09/2025 07:10:46 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/sampler.bin +12/09/2025 07:10:46 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000/random_states_0.pkl +12/09/2025 07:10:46 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1000 +12/09/2025 07:10:46 - INFO - __main__ - Generating videos for validation... +12/09/2025 07:10:46 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:07<00:00, 6.16it/s] +12/09/2025 07:10:55 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio +12/09/2025 07:11:30 - INFO - __main__ - Step: 1010 Loss: 10.7099 LR: 0.000000 +12/09/2025 07:12:05 - INFO - __main__ - Step: 1020 Loss: 10.6899 LR: 0.000000 +12/09/2025 07:12:39 - INFO - __main__ - Step: 1030 Loss: 10.7427 LR: 0.000000 +12/09/2025 07:13:14 - INFO - __main__ - Step: 1040 Loss: 10.7364 LR: 0.000000 +12/09/2025 07:13:48 - INFO - __main__ - Step: 1050 Loss: 10.6964 LR: 0.000000 +12/09/2025 07:14:22 - INFO - __main__ - Step: 1060 Loss: 10.7274 LR: 0.000000 +12/09/2025 07:15:02 - INFO - __main__ - Step: 1070 Loss: 10.7467 LR: 0.000000 +12/09/2025 07:15:36 - INFO - __main__ - Step: 1080 Loss: 10.7183 LR: 0.000000 +12/09/2025 07:16:10 - INFO - __main__ - Step: 1090 Loss: 10.7970 LR: 0.000000 +12/09/2025 07:16:45 - INFO - __main__ - Step: 1100 Loss: 10.7245 LR: 0.000000 +12/09/2025 07:17:20 - INFO - __main__ - Step: 1110 Loss: 10.6785 LR: 0.000000 +12/09/2025 07:17:55 - INFO - __main__ - Step: 1120 Loss: 10.8533 LR: 0.000000 +12/09/2025 07:18:29 - INFO - __main__ - Step: 1130 Loss: 10.7524 LR: 0.000000 +12/09/2025 07:19:05 - INFO - __main__ - Step: 1140 Loss: 10.7386 LR: 0.000000 +12/09/2025 07:19:39 - INFO - __main__ - Step: 1150 Loss: 10.7314 LR: 0.000000 +12/09/2025 07:20:14 - INFO - __main__ - Step: 1160 Loss: 10.7112 LR: 0.000000 +12/09/2025 07:20:49 - INFO - __main__ - Step: 1170 Loss: 10.7142 LR: 0.000000 +12/09/2025 07:21:23 - INFO - __main__ - Step: 1180 Loss: 10.7207 LR: 0.000000 +12/09/2025 07:21:57 - INFO - __main__ - Step: 1190 Loss: 10.8444 LR: 0.000000 +12/09/2025 07:22:33 - INFO - __main__ - Step: 1200 Loss: 10.7064 LR: 0.000000 +12/09/2025 07:23:08 - INFO - __main__ - Step: 1210 Loss: 10.7500 LR: 0.000000 +12/09/2025 07:23:43 - INFO - __main__ - Step: 1220 Loss: 10.8374 LR: 0.000000 +12/09/2025 07:24:24 - INFO - __main__ - Step: 1230 Loss: 10.7911 LR: 0.000000 +12/09/2025 07:24:58 - INFO - __main__ - Step: 1240 Loss: 10.7271 LR: 0.000000 +12/09/2025 07:25:33 - INFO - __main__ - Step: 1250 Loss: 10.6836 LR: 0.000000 +12/09/2025 07:26:08 - INFO - __main__ - Step: 1260 Loss: 10.6580 LR: 0.000000 +12/09/2025 07:26:42 - INFO - __main__ - Step: 1270 Loss: 10.7364 LR: 0.000000 +12/09/2025 07:27:17 - INFO - __main__ - Step: 1280 Loss: 10.6940 LR: 0.000000 +12/09/2025 07:27:52 - INFO - __main__ - Step: 1290 Loss: 10.7192 LR: 0.000000 +12/09/2025 07:28:28 - INFO - __main__ - Step: 1300 Loss: 10.7082 LR: 0.000000 +12/09/2025 07:29:03 - INFO - __main__ - Step: 1310 Loss: 10.8125 LR: 0.000000 +12/09/2025 07:29:38 - INFO - __main__ - Step: 1320 Loss: 10.7506 LR: 0.000000 +12/09/2025 07:30:17 - INFO - __main__ - Step: 1330 Loss: 10.6634 LR: 0.000000 +12/09/2025 07:30:51 - INFO - __main__ - Step: 1340 Loss: 10.7885 LR: 0.000000 +12/09/2025 07:31:26 - INFO - __main__ - Step: 1350 Loss: 10.7047 LR: 0.000000 +12/09/2025 07:32:01 - INFO - __main__ - Step: 1360 Loss: 10.6698 LR: 0.000000 +12/09/2025 07:32:35 - INFO - __main__ - Step: 1370 Loss: 10.6433 LR: 0.000000 +12/09/2025 07:33:08 - INFO - __main__ - Step: 1380 Loss: 10.6759 LR: 0.000000 +12/09/2025 07:33:43 - INFO - __main__ - Step: 1390 Loss: 10.7195 LR: 0.000000 +12/09/2025 07:34:19 - INFO - __main__ - Step: 1400 Loss: 10.6907 LR: 0.000000 +12/09/2025 07:34:52 - INFO - __main__ - Step: 1410 Loss: 10.6880 LR: 0.000000 +12/09/2025 
07:35:27 - INFO - __main__ - Step: 1420 Loss: 10.8639 LR: 0.000000 +12/09/2025 07:36:02 - INFO - __main__ - Step: 1430 Loss: 10.7094 LR: 0.000000 +12/09/2025 07:36:37 - INFO - __main__ - Step: 1440 Loss: 10.6651 LR: 0.000000 +12/09/2025 07:37:13 - INFO - __main__ - Step: 1450 Loss: 10.7197 LR: 0.000000 +12/09/2025 07:37:49 - INFO - __main__ - Step: 1460 Loss: 10.6842 LR: 0.000000 +12/09/2025 07:38:24 - INFO - __main__ - Step: 1470 Loss: 10.6803 LR: 0.000000 +12/09/2025 07:38:59 - INFO - __main__ - Step: 1480 Loss: 10.7731 LR: 0.000000 +12/09/2025 07:39:33 - INFO - __main__ - Step: 1490 Loss: 10.6577 LR: 0.000000 +12/09/2025 07:40:08 - INFO - __main__ - Step: 1500 Loss: 10.7049 LR: 0.000000 +12/09/2025 07:40:08 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500 +12/09/2025 07:40:14 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/optimizer.bin +12/09/2025 07:40:14 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/scheduler.bin +12/09/2025 07:40:14 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/sampler.bin +12/09/2025 07:40:14 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500/random_states_0.pkl +12/09/2025 07:40:14 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-1500 +12/09/2025 07:40:14 - INFO - __main__ - Generating videos for validation... +12/09/2025 07:40:14 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.72it/s] +12/09/2025 07:40:20 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio +12/09/2025 07:40:54 - INFO - __main__ - Step: 1510 Loss: 10.6579 LR: 0.000000 +12/09/2025 07:41:31 - INFO - __main__ - Step: 1520 Loss: 10.6986 LR: 0.000000 +12/09/2025 07:42:08 - INFO - __main__ - Step: 1530 Loss: 10.6443 LR: 0.000000 +12/09/2025 07:42:46 - INFO - __main__ - Step: 1540 Loss: 10.8676 LR: 0.000000 +12/09/2025 07:43:21 - INFO - __main__ - Step: 1550 Loss: 10.7138 LR: 0.000000 +12/09/2025 07:43:58 - INFO - __main__ - Step: 1560 Loss: 10.8056 LR: 0.000000 +12/09/2025 07:44:33 - INFO - __main__ - Step: 1570 Loss: 10.8255 LR: 0.000000 +12/09/2025 07:45:08 - INFO - __main__ - Step: 1580 Loss: 10.6627 LR: 0.000000 +12/09/2025 07:45:43 - INFO - __main__ - Step: 1590 Loss: 10.6622 LR: 0.000000 +12/09/2025 07:46:18 - INFO - __main__ - Step: 1600 Loss: 10.7134 LR: 0.000000 +12/09/2025 07:46:52 - INFO - __main__ - Step: 1610 Loss: 10.7862 LR: 0.000000 +12/09/2025 07:47:26 - INFO - __main__ - Step: 1620 Loss: 10.8276 LR: 0.000000 +12/09/2025 07:48:01 - INFO - __main__ - Step: 1630 Loss: 10.7237 LR: 0.000000 +12/09/2025 07:48:37 - INFO - __main__ - Step: 1640 Loss: 10.7666 LR: 0.000000 +12/09/2025 07:49:12 - INFO - __main__ - Step: 1650 Loss: 10.8079 LR: 0.000000 +12/09/2025 07:49:48 - INFO - __main__ - Step: 1660 Loss: 10.7356 LR: 0.000000 +12/09/2025 07:50:23 - INFO - __main__ - Step: 1670 Loss: 10.6761 LR: 0.000000 +12/09/2025 07:50:59 - INFO - __main__ - Step: 1680 Loss: 10.6437 LR: 0.000000 +12/09/2025 07:51:33 - INFO - __main__ - Step: 1690 Loss: 10.6769 LR: 0.000000 
+12/09/2025 07:52:09 - INFO - __main__ - Step: 1700 Loss: 10.7399 LR: 0.000000 +12/09/2025 07:52:44 - INFO - __main__ - Step: 1710 Loss: 10.6194 LR: 0.000000 +12/09/2025 07:53:20 - INFO - __main__ - Step: 1720 Loss: 10.6745 LR: 0.000000 +12/09/2025 07:53:55 - INFO - __main__ - Step: 1730 Loss: 10.8154 LR: 0.000000 +12/09/2025 07:54:30 - INFO - __main__ - Step: 1740 Loss: 10.6674 LR: 0.000000 +12/09/2025 07:55:06 - INFO - __main__ - Step: 1750 Loss: 10.6357 LR: 0.000000 +12/09/2025 07:55:43 - INFO - __main__ - Step: 1760 Loss: 10.7442 LR: 0.000000 +12/09/2025 07:56:18 - INFO - __main__ - Step: 1770 Loss: 10.8313 LR: 0.000000 +12/09/2025 07:56:54 - INFO - __main__ - Step: 1780 Loss: 10.8105 LR: 0.000000 +12/09/2025 07:57:29 - INFO - __main__ - Step: 1790 Loss: 10.8072 LR: 0.000000 +12/09/2025 07:58:04 - INFO - __main__ - Step: 1800 Loss: 10.6736 LR: 0.000000 +12/09/2025 07:58:39 - INFO - __main__ - Step: 1810 Loss: 10.6609 LR: 0.000000 +12/09/2025 07:59:16 - INFO - __main__ - Step: 1820 Loss: 10.7257 LR: 0.000000 +12/09/2025 07:59:51 - INFO - __main__ - Step: 1830 Loss: 10.6906 LR: 0.000000 +12/09/2025 08:00:26 - INFO - __main__ - Step: 1840 Loss: 10.6100 LR: 0.000000 +12/09/2025 08:01:02 - INFO - __main__ - Step: 1850 Loss: 10.7107 LR: 0.000000 +12/09/2025 08:01:37 - INFO - __main__ - Step: 1860 Loss: 10.7994 LR: 0.000000 +12/09/2025 08:02:11 - INFO - __main__ - Step: 1870 Loss: 10.6406 LR: 0.000000 +12/09/2025 08:02:46 - INFO - __main__ - Step: 1880 Loss: 10.6008 LR: 0.000000 +12/09/2025 08:03:22 - INFO - __main__ - Step: 1890 Loss: 10.6390 LR: 0.000000 +12/09/2025 08:03:58 - INFO - __main__ - Step: 1900 Loss: 10.8459 LR: 0.000000 +12/09/2025 08:04:32 - INFO - __main__ - Step: 1910 Loss: 10.7128 LR: 0.000000 +12/09/2025 08:05:07 - INFO - __main__ - Step: 1920 Loss: 10.7213 LR: 0.000000 +12/09/2025 08:05:44 - INFO - __main__ - Step: 1930 Loss: 10.6535 LR: 0.000000 +12/09/2025 08:06:18 - INFO - __main__ - Step: 1940 Loss: 10.6877 LR: 0.000000 +12/09/2025 08:06:54 - INFO - __main__ - Step: 1950 Loss: 10.6792 LR: 0.000000 +12/09/2025 08:07:30 - INFO - __main__ - Step: 1960 Loss: 10.7321 LR: 0.000000 +12/09/2025 08:08:06 - INFO - __main__ - Step: 1970 Loss: 10.7119 LR: 0.000000 +12/09/2025 08:08:41 - INFO - __main__ - Step: 1980 Loss: 10.6359 LR: 0.000000 +12/09/2025 08:09:16 - INFO - __main__ - Step: 1990 Loss: 10.7073 LR: 0.000000 +12/09/2025 08:09:51 - INFO - __main__ - Step: 2000 Loss: 10.7035 LR: 0.000000 +12/09/2025 08:09:51 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000 +12/09/2025 08:09:55 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/optimizer.bin +12/09/2025 08:09:55 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/scheduler.bin +12/09/2025 08:09:55 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/sampler.bin +12/09/2025 08:09:55 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000/random_states_0.pkl +12/09/2025 08:09:55 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2000 +12/09/2025 08:09:55 - INFO - __main__ - Generating videos for validation... +12/09/2025 08:09:55 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.77it/s] +12/09/2025 08:10:02 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio +12/09/2025 08:10:36 - INFO - __main__ - Step: 2010 Loss: 10.8088 LR: 0.000000 +12/09/2025 08:11:11 - INFO - __main__ - Step: 2020 Loss: 10.6305 LR: 0.000000 +12/09/2025 08:11:47 - INFO - __main__ - Step: 2030 Loss: 10.8125 LR: 0.000000 +12/09/2025 08:12:22 - INFO - __main__ - Step: 2040 Loss: 10.8360 LR: 0.000000 +12/09/2025 08:12:58 - INFO - __main__ - Step: 2050 Loss: 10.7444 LR: 0.000000 +12/09/2025 08:13:33 - INFO - __main__ - Step: 2060 Loss: 10.8345 LR: 0.000000 +12/09/2025 08:14:09 - INFO - __main__ - Step: 2070 Loss: 10.7116 LR: 0.000000 +12/09/2025 08:14:44 - INFO - __main__ - Step: 2080 Loss: 10.7805 LR: 0.000000 +12/09/2025 08:15:20 - INFO - __main__ - Step: 2090 Loss: 10.7028 LR: 0.000000 +12/09/2025 08:15:55 - INFO - __main__ - Step: 2100 Loss: 10.6629 LR: 0.000000 +12/09/2025 08:16:29 - INFO - __main__ - Step: 2110 Loss: 10.6475 LR: 0.000000 +12/09/2025 08:17:06 - INFO - __main__ - Step: 2120 Loss: 10.7187 LR: 0.000000 +12/09/2025 08:17:41 - INFO - __main__ - Step: 2130 Loss: 10.6920 LR: 0.000000 +12/09/2025 08:18:15 - INFO - __main__ - Step: 2140 Loss: 10.6203 LR: 0.000000 +12/09/2025 08:18:51 - INFO - __main__ - Step: 2150 Loss: 10.8124 LR: 0.000000 +12/09/2025 08:19:27 - INFO - __main__ - Step: 2160 Loss: 10.6412 LR: 0.000000 +12/09/2025 08:20:02 - INFO - __main__ - Step: 2170 Loss: 10.6837 LR: 0.000000 +12/09/2025 08:20:37 - INFO - __main__ - Step: 2180 Loss: 10.7157 LR: 0.000000 +12/09/2025 08:21:12 - INFO - __main__ - Step: 2190 Loss: 10.6561 LR: 0.000000 +12/09/2025 08:21:48 - INFO - __main__ - Step: 2200 Loss: 10.6555 LR: 0.000000 +12/09/2025 08:22:24 - INFO - __main__ - Step: 2210 Loss: 10.6130 LR: 0.000000 +12/09/2025 08:23:00 - INFO - __main__ - Step: 2220 Loss: 10.6601 LR: 0.000000 +12/09/2025 08:23:37 - INFO - __main__ - Step: 2230 Loss: 10.7609 LR: 0.000000 +12/09/2025 08:24:13 - INFO - __main__ - Step: 2240 Loss: 10.6826 LR: 0.000000 +12/09/2025 08:24:50 - INFO - __main__ - Step: 2250 Loss: 10.6676 LR: 0.000000 +12/09/2025 08:25:26 - INFO - __main__ - Step: 2260 Loss: 10.6843 LR: 0.000000 +12/09/2025 08:26:01 - INFO - __main__ - Step: 2270 Loss: 10.6482 LR: 0.000000 +12/09/2025 08:26:37 - INFO - __main__ - Step: 2280 Loss: 10.7033 LR: 0.000000 +12/09/2025 08:27:13 - INFO - __main__ - Step: 2290 Loss: 10.6908 LR: 0.000000 +12/09/2025 08:27:47 - INFO - __main__ - Step: 2300 Loss: 10.7051 LR: 0.000000 +12/09/2025 08:28:24 - INFO - __main__ - Step: 2310 Loss: 10.6186 LR: 0.000000 +12/09/2025 08:28:59 - INFO - __main__ - Step: 2320 Loss: 10.6609 LR: 0.000000 +12/09/2025 08:29:35 - INFO - __main__ - Step: 2330 Loss: 10.6018 LR: 0.000000 +12/09/2025 08:30:11 - INFO - __main__ - Step: 2340 Loss: 10.6501 LR: 0.000000 +12/09/2025 08:30:50 - INFO - __main__ - Step: 2350 Loss: 10.7127 LR: 0.000000 +12/09/2025 08:31:26 - INFO - __main__ - Step: 2360 Loss: 10.5897 LR: 0.000000 +12/09/2025 08:32:02 - INFO - __main__ - Step: 2370 Loss: 10.6239 LR: 0.000000 +12/09/2025 08:32:39 - INFO - __main__ - Step: 2380 Loss: 10.8009 LR: 0.000000 +12/09/2025 08:33:17 - INFO - __main__ - Step: 2390 Loss: 10.6583 LR: 0.000000 +12/09/2025 08:33:52 - INFO - __main__ - Step: 2400 Loss: 10.7898 LR: 0.000000 +12/09/2025 08:34:29 - INFO - __main__ - Step: 2410 Loss: 10.7053 LR: 0.000000 +12/09/2025 
08:35:04 - INFO - __main__ - Step: 2420 Loss: 10.7878 LR: 0.000000 +12/09/2025 08:35:40 - INFO - __main__ - Step: 2430 Loss: 10.7829 LR: 0.000000 +12/09/2025 08:36:16 - INFO - __main__ - Step: 2440 Loss: 10.5827 LR: 0.000000 +12/09/2025 08:36:52 - INFO - __main__ - Step: 2450 Loss: 10.6923 LR: 0.000000 +12/09/2025 08:37:27 - INFO - __main__ - Step: 2460 Loss: 10.7444 LR: 0.000000 +12/09/2025 08:38:03 - INFO - __main__ - Step: 2470 Loss: 10.8070 LR: 0.000000 +12/09/2025 08:38:37 - INFO - __main__ - Step: 2480 Loss: 10.6172 LR: 0.000000 +12/09/2025 08:39:13 - INFO - __main__ - Step: 2490 Loss: 10.7115 LR: 0.000000 +12/09/2025 08:39:49 - INFO - __main__ - Step: 2500 Loss: 10.6463 LR: 0.000000 +12/09/2025 08:39:49 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500 +12/09/2025 08:39:54 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/optimizer.bin +12/09/2025 08:39:54 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/scheduler.bin +12/09/2025 08:39:54 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/sampler.bin +12/09/2025 08:39:54 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500/random_states_0.pkl +12/09/2025 08:39:54 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-2500 +12/09/2025 08:39:54 - INFO - __main__ - Generating videos for validation... +12/09/2025 08:39:54 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.79it/s] +12/09/2025 08:40:00 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio +12/09/2025 08:40:35 - INFO - __main__ - Step: 2510 Loss: 10.7087 LR: 0.000000 +12/09/2025 08:41:13 - INFO - __main__ - Step: 2520 Loss: 10.6353 LR: 0.000000 +12/09/2025 08:41:50 - INFO - __main__ - Step: 2530 Loss: 10.7710 LR: 0.000000 +12/09/2025 08:42:30 - INFO - __main__ - Step: 2540 Loss: 10.7668 LR: 0.000000 +12/09/2025 08:43:09 - INFO - __main__ - Step: 2550 Loss: 10.8079 LR: 0.000000 +12/09/2025 08:43:46 - INFO - __main__ - Step: 2560 Loss: 10.6643 LR: 0.000000 +12/09/2025 08:44:22 - INFO - __main__ - Step: 2570 Loss: 10.5756 LR: 0.000000 +12/09/2025 08:44:58 - INFO - __main__ - Step: 2580 Loss: 10.6465 LR: 0.000000 +12/09/2025 08:45:35 - INFO - __main__ - Step: 2590 Loss: 10.6285 LR: 0.000000 +12/09/2025 08:46:10 - INFO - __main__ - Step: 2600 Loss: 10.6634 LR: 0.000000 +12/09/2025 08:46:46 - INFO - __main__ - Step: 2610 Loss: 10.7108 LR: 0.000000 +12/09/2025 08:47:22 - INFO - __main__ - Step: 2620 Loss: 10.6576 LR: 0.000000 +12/09/2025 08:47:57 - INFO - __main__ - Step: 2630 Loss: 10.6358 LR: 0.000000 +12/09/2025 08:48:33 - INFO - __main__ - Step: 2640 Loss: 10.7581 LR: 0.000000 +12/09/2025 08:49:09 - INFO - __main__ - Step: 2650 Loss: 10.6845 LR: 0.000000 +12/09/2025 08:49:45 - INFO - __main__ - Step: 2660 Loss: 10.6936 LR: 0.000000 +12/09/2025 08:50:21 - INFO - __main__ - Step: 2670 Loss: 10.6546 LR: 0.000000 +12/09/2025 08:50:58 - INFO - __main__ - Step: 2680 Loss: 10.6183 LR: 0.000000 +12/09/2025 08:51:32 - INFO - __main__ - Step: 2690 Loss: 10.7849 LR: 0.000000 
+12/09/2025 08:52:08 - INFO - __main__ - Step: 2700 Loss: 10.7991 LR: 0.000000 +12/09/2025 08:52:44 - INFO - __main__ - Step: 2710 Loss: 10.6829 LR: 0.000000 +12/09/2025 08:53:19 - INFO - __main__ - Step: 2720 Loss: 10.6621 LR: 0.000000 +12/09/2025 08:53:54 - INFO - __main__ - Step: 2730 Loss: 10.7058 LR: 0.000000 +12/09/2025 08:54:30 - INFO - __main__ - Step: 2740 Loss: 10.6618 LR: 0.000000 +12/09/2025 08:55:06 - INFO - __main__ - Step: 2750 Loss: 10.6734 LR: 0.000000 +12/09/2025 08:55:42 - INFO - __main__ - Step: 2760 Loss: 10.6153 LR: 0.000000 +12/09/2025 08:56:18 - INFO - __main__ - Step: 2770 Loss: 10.6321 LR: 0.000000 +12/09/2025 08:56:53 - INFO - __main__ - Step: 2780 Loss: 10.5745 LR: 0.000000 +12/09/2025 08:57:28 - INFO - __main__ - Step: 2790 Loss: 10.6596 LR: 0.000000 +12/09/2025 08:58:04 - INFO - __main__ - Step: 2800 Loss: 10.6588 LR: 0.000000 +12/09/2025 08:58:39 - INFO - __main__ - Step: 2810 Loss: 10.6601 LR: 0.000000 +12/09/2025 08:59:15 - INFO - __main__ - Step: 2820 Loss: 10.7636 LR: 0.000000 +12/09/2025 08:59:50 - INFO - __main__ - Step: 2830 Loss: 10.6506 LR: 0.000000 +12/09/2025 09:00:26 - INFO - __main__ - Step: 2840 Loss: 10.6805 LR: 0.000000 +12/09/2025 09:01:02 - INFO - __main__ - Step: 2850 Loss: 10.6614 LR: 0.000000 +12/09/2025 09:01:38 - INFO - __main__ - Step: 2860 Loss: 10.7792 LR: 0.000000 +12/09/2025 09:02:13 - INFO - __main__ - Step: 2870 Loss: 10.6542 LR: 0.000000 +12/09/2025 09:02:49 - INFO - __main__ - Step: 2880 Loss: 10.6708 LR: 0.000000 +12/09/2025 09:03:25 - INFO - __main__ - Step: 2890 Loss: 10.7061 LR: 0.000000 +12/09/2025 09:04:00 - INFO - __main__ - Step: 2900 Loss: 10.6127 LR: 0.000000 +12/09/2025 09:04:35 - INFO - __main__ - Step: 2910 Loss: 10.6518 LR: 0.000000 +12/09/2025 09:05:12 - INFO - __main__ - Step: 2920 Loss: 10.6581 LR: 0.000000 +12/09/2025 09:05:48 - INFO - __main__ - Step: 2930 Loss: 10.6383 LR: 0.000000 +12/09/2025 09:06:24 - INFO - __main__ - Step: 2940 Loss: 10.8066 LR: 0.000000 +12/09/2025 09:07:01 - INFO - __main__ - Step: 2950 Loss: 10.7880 LR: 0.000000 +12/09/2025 09:07:36 - INFO - __main__ - Step: 2960 Loss: 10.5884 LR: 0.000000 +12/09/2025 09:08:11 - INFO - __main__ - Step: 2970 Loss: 10.7773 LR: 0.000000 +12/09/2025 09:08:48 - INFO - __main__ - Step: 2980 Loss: 10.6863 LR: 0.000000 +12/09/2025 09:09:24 - INFO - __main__ - Step: 2990 Loss: 10.8002 LR: 0.000000 +12/09/2025 09:10:00 - INFO - __main__ - Step: 3000 Loss: 10.7093 LR: 0.000000 +12/09/2025 09:10:00 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000 +12/09/2025 09:10:04 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/optimizer.bin +12/09/2025 09:10:04 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/scheduler.bin +12/09/2025 09:10:04 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/sampler.bin +12/09/2025 09:10:04 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000/random_states_0.pkl +12/09/2025 09:10:04 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3000 +12/09/2025 09:10:04 - INFO - __main__ - Generating videos for validation... +12/09/2025 09:10:04 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.02it/s] +12/09/2025 09:10:10 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio +12/09/2025 09:10:46 - INFO - __main__ - Step: 3010 Loss: 10.6042 LR: 0.000000 +12/09/2025 09:11:22 - INFO - __main__ - Step: 3020 Loss: 10.7013 LR: 0.000000 +12/09/2025 09:11:58 - INFO - __main__ - Step: 3030 Loss: 10.7090 LR: 0.000000 +12/09/2025 09:12:33 - INFO - __main__ - Step: 3040 Loss: 10.6076 LR: 0.000000 +12/09/2025 09:13:09 - INFO - __main__ - Step: 3050 Loss: 10.6399 LR: 0.000000 +12/09/2025 09:13:45 - INFO - __main__ - Step: 3060 Loss: 10.6489 LR: 0.000000 +12/09/2025 09:14:19 - INFO - __main__ - Step: 3070 Loss: 10.6214 LR: 0.000000 +12/09/2025 09:14:55 - INFO - __main__ - Step: 3080 Loss: 10.7581 LR: 0.000000 +12/09/2025 09:15:31 - INFO - __main__ - Step: 3090 Loss: 10.6609 LR: 0.000000 +12/09/2025 09:16:06 - INFO - __main__ - Step: 3100 Loss: 10.6468 LR: 0.000000 +12/09/2025 09:16:41 - INFO - __main__ - Step: 3110 Loss: 10.6854 LR: 0.000000 +12/09/2025 09:17:26 - INFO - __main__ - Step: 3120 Loss: 10.6509 LR: 0.000000 +12/09/2025 09:18:02 - INFO - __main__ - Step: 3130 Loss: 10.6653 LR: 0.000000 +12/09/2025 09:18:37 - INFO - __main__ - Step: 3140 Loss: 10.6436 LR: 0.000000 +12/09/2025 09:19:14 - INFO - __main__ - Step: 3150 Loss: 10.8747 LR: 0.000000 +12/09/2025 09:19:49 - INFO - __main__ - Step: 3160 Loss: 10.7215 LR: 0.000000 +12/09/2025 09:20:24 - INFO - __main__ - Step: 3170 Loss: 10.6346 LR: 0.000000 +12/09/2025 09:20:59 - INFO - __main__ - Step: 3180 Loss: 10.6490 LR: 0.000000 +12/09/2025 09:21:36 - INFO - __main__ - Step: 3190 Loss: 10.6399 LR: 0.000000 +12/09/2025 09:22:13 - INFO - __main__ - Step: 3200 Loss: 10.6877 LR: 0.000000 +12/09/2025 09:22:49 - INFO - __main__ - Step: 3210 Loss: 10.8075 LR: 0.000000 +12/09/2025 09:23:24 - INFO - __main__ - Step: 3220 Loss: 10.6954 LR: 0.000000 +12/09/2025 09:24:00 - INFO - __main__ - Step: 3230 Loss: 10.6205 LR: 0.000000 +12/09/2025 09:24:36 - INFO - __main__ - Step: 3240 Loss: 10.6941 LR: 0.000000 +12/09/2025 09:25:11 - INFO - __main__ - Step: 3250 Loss: 10.5495 LR: 0.000000 +12/09/2025 09:25:47 - INFO - __main__ - Step: 3260 Loss: 10.7130 LR: 0.000000 +12/09/2025 09:26:22 - INFO - __main__ - Step: 3270 Loss: 10.6809 LR: 0.000000 +12/09/2025 09:26:59 - INFO - __main__ - Step: 3280 Loss: 10.6429 LR: 0.000000 +12/09/2025 09:27:34 - INFO - __main__ - Step: 3290 Loss: 10.7921 LR: 0.000000 +12/09/2025 09:28:10 - INFO - __main__ - Step: 3300 Loss: 10.6165 LR: 0.000000 +12/09/2025 09:28:44 - INFO - __main__ - Step: 3310 Loss: 10.7733 LR: 0.000000 +12/09/2025 09:29:19 - INFO - __main__ - Step: 3320 Loss: 10.7044 LR: 0.000000 +12/09/2025 09:29:54 - INFO - __main__ - Step: 3330 Loss: 10.6457 LR: 0.000000 +12/09/2025 09:30:30 - INFO - __main__ - Step: 3340 Loss: 10.8392 LR: 0.000000 +12/09/2025 09:31:05 - INFO - __main__ - Step: 3350 Loss: 10.7154 LR: 0.000000 +12/09/2025 09:31:40 - INFO - __main__ - Step: 3360 Loss: 10.6910 LR: 0.000000 +12/09/2025 09:32:16 - INFO - __main__ - Step: 3370 Loss: 10.5989 LR: 0.000000 +12/09/2025 09:32:52 - INFO - __main__ - Step: 3380 Loss: 10.6108 LR: 0.000000 +12/09/2025 09:33:28 - INFO - __main__ - Step: 3390 Loss: 10.6229 LR: 0.000000 +12/09/2025 09:34:04 - INFO - __main__ - Step: 3400 Loss: 10.6160 LR: 0.000000 +12/09/2025 09:34:40 - INFO - __main__ - Step: 3410 Loss: 10.6124 LR: 0.000000 +12/09/2025 
09:35:14 - INFO - __main__ - Step: 3420 Loss: 10.6181 LR: 0.000000 +12/09/2025 09:35:53 - INFO - __main__ - Step: 3430 Loss: 10.7191 LR: 0.000000 +12/09/2025 09:36:28 - INFO - __main__ - Step: 3440 Loss: 10.7020 LR: 0.000000 +12/09/2025 09:37:04 - INFO - __main__ - Step: 3450 Loss: 10.6007 LR: 0.000000 +12/09/2025 09:37:39 - INFO - __main__ - Step: 3460 Loss: 10.6844 LR: 0.000000 +12/09/2025 09:38:16 - INFO - __main__ - Step: 3470 Loss: 10.6536 LR: 0.000000 +12/09/2025 09:38:51 - INFO - __main__ - Step: 3480 Loss: 10.6183 LR: 0.000000 +12/09/2025 09:39:28 - INFO - __main__ - Step: 3490 Loss: 10.6050 LR: 0.000000 +12/09/2025 09:40:03 - INFO - __main__ - Step: 3500 Loss: 10.6128 LR: 0.000000 +12/09/2025 09:40:03 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500 +12/09/2025 09:40:09 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/optimizer.bin +12/09/2025 09:40:09 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/scheduler.bin +12/09/2025 09:40:09 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/sampler.bin +12/09/2025 09:40:09 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500/random_states_0.pkl +12/09/2025 09:40:09 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-3500 +12/09/2025 09:40:09 - INFO - __main__ - Generating videos for validation... +12/09/2025 09:40:09 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.72it/s] +12/09/2025 09:40:16 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio +12/09/2025 09:40:51 - INFO - __main__ - Step: 3510 Loss: 10.8156 LR: 0.000000 +12/09/2025 09:41:27 - INFO - __main__ - Step: 3520 Loss: 10.6784 LR: 0.000000 +12/09/2025 09:42:03 - INFO - __main__ - Step: 3530 Loss: 10.7747 LR: 0.000000 +12/09/2025 09:42:40 - INFO - __main__ - Step: 3540 Loss: 10.6523 LR: 0.000000 +12/09/2025 09:43:16 - INFO - __main__ - Step: 3550 Loss: 10.6710 LR: 0.000000 +12/09/2025 09:43:51 - INFO - __main__ - Step: 3560 Loss: 10.6228 LR: 0.000000 +12/09/2025 09:44:25 - INFO - __main__ - Step: 3570 Loss: 10.6325 LR: 0.000000 +12/09/2025 09:45:01 - INFO - __main__ - Step: 3580 Loss: 10.6580 LR: 0.000000 +12/09/2025 09:45:35 - INFO - __main__ - Step: 3590 Loss: 10.6412 LR: 0.000000 +12/09/2025 09:46:10 - INFO - __main__ - Step: 3600 Loss: 10.6338 LR: 0.000000 +12/09/2025 09:46:46 - INFO - __main__ - Step: 3610 Loss: 10.6233 LR: 0.000000 +12/09/2025 09:47:21 - INFO - __main__ - Step: 3620 Loss: 10.7960 LR: 0.000000 +12/09/2025 09:47:57 - INFO - __main__ - Step: 3630 Loss: 10.6824 LR: 0.000000 +12/09/2025 09:48:33 - INFO - __main__ - Step: 3640 Loss: 10.5719 LR: 0.000000 +12/09/2025 09:49:08 - INFO - __main__ - Step: 3650 Loss: 10.7161 LR: 0.000000 +12/09/2025 09:49:43 - INFO - __main__ - Step: 3660 Loss: 10.5788 LR: 0.000000 +12/09/2025 09:50:18 - INFO - __main__ - Step: 3670 Loss: 10.5937 LR: 0.000000 +12/09/2025 09:50:57 - INFO - __main__ - Step: 3680 Loss: 10.6459 LR: 0.000000 +12/09/2025 09:51:36 - INFO - __main__ - Step: 3690 Loss: 10.6017 LR: 0.000000 
+12/09/2025 09:52:12 - INFO - __main__ - Step: 3700 Loss: 10.6180 LR: 0.000000 +12/09/2025 09:52:47 - INFO - __main__ - Step: 3710 Loss: 10.7492 LR: 0.000000 +12/09/2025 09:53:25 - INFO - __main__ - Step: 3720 Loss: 10.6616 LR: 0.000000 +12/09/2025 09:54:02 - INFO - __main__ - Step: 3730 Loss: 10.5907 LR: 0.000000 +12/09/2025 09:54:37 - INFO - __main__ - Step: 3740 Loss: 10.6316 LR: 0.000000 +12/09/2025 09:55:15 - INFO - __main__ - Step: 3750 Loss: 10.5829 LR: 0.000000 +12/09/2025 09:55:50 - INFO - __main__ - Step: 3760 Loss: 10.8051 LR: 0.000000 +12/09/2025 09:56:26 - INFO - __main__ - Step: 3770 Loss: 10.6096 LR: 0.000000 +12/09/2025 09:57:02 - INFO - __main__ - Step: 3780 Loss: 10.6737 LR: 0.000000 +12/09/2025 09:57:38 - INFO - __main__ - Step: 3790 Loss: 10.6407 LR: 0.000000 +12/09/2025 09:58:14 - INFO - __main__ - Step: 3800 Loss: 10.6684 LR: 0.000000 +12/09/2025 09:58:51 - INFO - __main__ - Step: 3810 Loss: 10.6076 LR: 0.000000 +12/09/2025 09:59:26 - INFO - __main__ - Step: 3820 Loss: 10.5907 LR: 0.000000 +12/09/2025 10:00:00 - INFO - __main__ - Step: 3830 Loss: 10.7516 LR: 0.000000 +12/09/2025 10:00:38 - INFO - __main__ - Step: 3840 Loss: 10.6090 LR: 0.000000 +12/09/2025 10:01:13 - INFO - __main__ - Step: 3850 Loss: 10.6274 LR: 0.000000 +12/09/2025 10:01:48 - INFO - __main__ - Step: 3860 Loss: 10.7927 LR: 0.000000 +12/09/2025 10:02:23 - INFO - __main__ - Step: 3870 Loss: 10.5754 LR: 0.000000 +12/09/2025 10:02:58 - INFO - __main__ - Step: 3880 Loss: 10.6764 LR: 0.000000 +12/09/2025 10:03:34 - INFO - __main__ - Step: 3890 Loss: 10.5462 LR: 0.000000 +12/09/2025 10:04:10 - INFO - __main__ - Step: 3900 Loss: 10.5982 LR: 0.000000 +12/09/2025 10:04:48 - INFO - __main__ - Step: 3910 Loss: 10.6223 LR: 0.000000 +12/09/2025 10:05:24 - INFO - __main__ - Step: 3920 Loss: 10.7905 LR: 0.000000 +12/09/2025 10:05:59 - INFO - __main__ - Step: 3930 Loss: 10.6241 LR: 0.000000 +12/09/2025 10:06:35 - INFO - __main__ - Step: 3940 Loss: 10.7700 LR: 0.000000 +12/09/2025 10:07:09 - INFO - __main__ - Step: 3950 Loss: 10.6646 LR: 0.000000 +12/09/2025 10:07:44 - INFO - __main__ - Step: 3960 Loss: 10.7494 LR: 0.000000 +12/09/2025 10:08:19 - INFO - __main__ - Step: 3970 Loss: 10.6112 LR: 0.000000 +12/09/2025 10:08:56 - INFO - __main__ - Step: 3980 Loss: 10.7726 LR: 0.000000 +12/09/2025 10:09:31 - INFO - __main__ - Step: 3990 Loss: 10.6219 LR: 0.000000 +12/09/2025 10:10:07 - INFO - __main__ - Step: 4000 Loss: 10.5863 LR: 0.000000 +12/09/2025 10:10:07 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 +12/09/2025 10:10:12 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/optimizer.bin +12/09/2025 10:10:12 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/scheduler.bin +12/09/2025 10:10:12 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/sampler.bin +12/09/2025 10:10:12 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000/random_states_0.pkl +12/09/2025 10:10:12 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 +12/09/2025 10:10:12 - INFO - __main__ - Generating videos for validation... +12/09/2025 10:10:12 - INFO - __main__ - Generating videos for validation... 
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.39it/s] +12/09/2025 10:10:19 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio +12/09/2025 10:10:54 - INFO - __main__ - Step: 4010 Loss: 10.6506 LR: 0.000000 +12/09/2025 10:11:31 - INFO - __main__ - Step: 4020 Loss: 10.6645 LR: 0.000000 +12/09/2025 10:12:10 - INFO - __main__ - Step: 4030 Loss: 10.6335 LR: 0.000000 +12/09/2025 10:12:50 - INFO - __main__ - Step: 4040 Loss: 10.6103 LR: 0.000000 +12/09/2025 10:13:29 - INFO - __main__ - Step: 4050 Loss: 10.6712 LR: 0.000000 +12/09/2025 10:14:03 - INFO - __main__ - Step: 4060 Loss: 10.7147 LR: 0.000000 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1435, in + frames_tensor = torch.stack(frames, dim=0) + File "/mnt/Meissonic/train/train_mei_video.py", line 1283, in main + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1435, in +[rank0]: frames_tensor = torch.stack(frames, dim=0) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1283, in main +[rank0]: +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/requirements.txt b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 
+pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..882ab777b3727eb58f6fc36980670267858e57b6 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/files/wandb-metadata.json @@ -0,0 +1,154 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T06:08:56.508700Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--freeze_wan_backbone", + "--wan_backbone_lr_ratio", + "0.0", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + 
"--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12017912905728" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "bs5hxou77f8nhtr968pz9al3r4zobq2y" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug-core.log b/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..9d6d9f3bbaebbc8a53a9a1e5442dc4f9afb7e85a --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-09T06:08:56.58235045Z","level":"INFO","msg":"main: starting 
server","port-filename":"/opt/dlami/nvme/tmp_user/tmptuqk2li8/port-461528.txt","pid":461528,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T06:08:56.582820205Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":461528} +{"time":"2025-12-09T06:08:56.582800746Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-461528-461798-2163184360/socket","Net":"unix"}} +{"time":"2025-12-09T06:08:56.764351024Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T06:08:56.770261851Z","level":"INFO","msg":"handleInformInit: received","streamId":"ctbp97lz","id":"1(@)"} +{"time":"2025-12-09T06:08:56.937028325Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ctbp97lz","id":"1(@)"} +{"time":"2025-12-09T10:14:12.178908066Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug-internal.log b/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..694f446ebdd8cafb2d0747848cd9c87ab021f02e --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-09T06:08:56.770384765Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T06:08:56.936708556Z","level":"INFO","msg":"stream: created new stream","id":"ctbp97lz"} +{"time":"2025-12-09T06:08:56.936930986Z","level":"INFO","msg":"handler: started","stream_id":"ctbp97lz"} +{"time":"2025-12-09T06:08:56.937020175Z","level":"INFO","msg":"stream: started","id":"ctbp97lz"} +{"time":"2025-12-09T06:08:56.937037269Z","level":"INFO","msg":"writer: started","stream_id":"ctbp97lz"} +{"time":"2025-12-09T06:08:56.937040368Z","level":"INFO","msg":"sender: started","stream_id":"ctbp97lz"} diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug.log b/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..1f240f4310a8b47ce8509c33dcfe86b049f871b4 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_setup.py:_flush():80] Configure stats pid to 461528 +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug.log +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_060856-ctbp97lz/logs/debug-internal.log +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_init.py:init():841] calling init triggers +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_init.py:init():846] wandb.init called with sweep_config: {} 
+config: {'_wandb': {}} +2025-12-09 06:08:56,511 INFO MainThread:461528 [wandb_init.py:init():889] starting backend +2025-12-09 06:08:56,764 INFO MainThread:461528 [wandb_init.py:init():892] sending inform_init request +2025-12-09 06:08:56,768 INFO MainThread:461528 [wandb_init.py:init():900] backend started and connected +2025-12-09 06:08:56,769 INFO MainThread:461528 [wandb_init.py:init():970] updated telemetry +2025-12-09 06:08:56,774 INFO MainThread:461528 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 06:08:57,287 INFO MainThread:461528 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 06:08:57,409 INFO MainThread:461528 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 06:08:57,409 INFO MainThread:461528 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 06:08:57,410 INFO MainThread:461528 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 06:08:57,410 INFO MainThread:461528 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 06:08:57,412 INFO MainThread:461528 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 06:08:57,413 INFO MainThread:461528 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': True, 'wan_backbone_lr_ratio': 0.0} diff --git a/Meissonic/wandb/run-20251209_060856-ctbp97lz/run-ctbp97lz.wandb b/Meissonic/wandb/run-20251209_060856-ctbp97lz/run-ctbp97lz.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e86288104af59198dac2fa71e0d0a105a8f0cc90 --- /dev/null +++ b/Meissonic/wandb/run-20251209_060856-ctbp97lz/run-ctbp97lz.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f1decb264c3facb98478fd342c9a75ef40f7059408c46241f8c0612470d79f01 +size 3866624 diff --git a/Meissonic/wandb/run-20251209_101545-18h777jw/files/output.log b/Meissonic/wandb/run-20251209_101545-18h777jw/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a3a63d61674542da2271205d5d01afe33a22cbfc --- /dev/null +++ b/Meissonic/wandb/run-20251209_101545-18h777jw/files/output.log @@ -0,0 +1,159 @@ +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 71.91it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 10:15:49 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6916.40it/s] +12/09/2025 10:15:50 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=32, W'=56 +12/09/2025 10:15:50 - INFO - __main__ - Theoretical dimensions: F'=1, H'=32, W'=56 +12/09/2025 10:15:50 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 10:15:51 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 10:16:07 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 10:16:07 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 10:16:25 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 10:16:28 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 10:16:28 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 10:16:28 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 10:16:28 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 10:16:36 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 10:16:36 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 10:16:36 - INFO - __main__ - Dataloader configuration: +12/09/2025 10:16:36 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 10:16:36 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 10:16:36 - INFO - __main__ - - persistent_workers: True +12/09/2025 10:16:36 - INFO - __main__ - - pin_memory: True +12/09/2025 10:16:36 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 10:16:56 - INFO - __main__ - ***** Running training ***** +12/09/2025 10:16:56 - INFO - __main__ - Num training steps = 10000 +12/09/2025 10:16:56 - INFO - __main__ - Instantaneous batch size per device = 2 +12/09/2025 10:16:56 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/09/2025 10:16:56 - INFO - __main__ - Gradient Accumulation steps = 4 +12/09/2025 10:17:46 - INFO - __main__ - Step: 10 Loss: 11.0743 LR: 0.000300 +12/09/2025 10:18:24 - INFO - __main__ - Step: 20 Loss: 11.0744 LR: 0.000300 +12/09/2025 10:19:00 - INFO - __main__ - Step: 30 Loss: 11.0724 LR: 0.000300 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1485, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1297, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward + out_list = torch.utils.checkpoint.checkpoint( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1011, in 
custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 358, in forward + q=rope_apply(q, grid_sizes, freqs), + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 241, in rope_apply + x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape( +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1485, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1297, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward +[rank0]: return model_forward(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward +[rank0]: out_list = torch.utils.checkpoint.checkpoint( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint +[rank0]: ret = function(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward +[rank0]: y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 358, in forward +[rank0]: q=rope_apply(q, grid_sizes, freqs), +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 241, in rope_apply +[rank0]: x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape( +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251209_101545-18h777jw/files/requirements.txt b/Meissonic/wandb/run-20251209_101545-18h777jw/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_101545-18h777jw/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 
+pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_101545-18h777jw/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_101545-18h777jw/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..436a09ffa610442cb17cace43f23817fd7c5e738 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101545-18h777jw/files/wandb-metadata.json @@ -0,0 +1,151 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T10:15:45.646078Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--pretrained_model_name_or_path", + "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12040636145664" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA 
A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "njobkvrswapd2mjdxwfj2z2e0j34qdah" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug-core.log b/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..aacddadd55054c67fecc60ad574c2bfc5b8d6196 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-09T10:15:45.717897393Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpq5k3n3kj/port-1710831.txt","pid":1710831,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T10:15:45.718349495Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1710831} +{"time":"2025-12-09T10:15:45.718359886Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1710831-1711023-3076955892/socket","Net":"unix"}} +{"time":"2025-12-09T10:15:45.900357571Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T10:15:45.906355067Z","level":"INFO","msg":"handleInformInit: received","streamId":"18h777jw","id":"1(@)"} +{"time":"2025-12-09T10:15:46.074035273Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"18h777jw","id":"1(@)"} +{"time":"2025-12-09T10:19:13.502015816Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug-internal.log b/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..db621149eeaa4432cfffbedb866c25834feec27c --- /dev/null +++ b/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-09T10:15:45.906609658Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T10:15:46.07379782Z","level":"INFO","msg":"stream: created new stream","id":"18h777jw"} +{"time":"2025-12-09T10:15:46.073887373Z","level":"INFO","msg":"handler: started","stream_id":"18h777jw"} +{"time":"2025-12-09T10:15:46.074025424Z","level":"INFO","msg":"stream: started","id":"18h777jw"} +{"time":"2025-12-09T10:15:46.074035157Z","level":"INFO","msg":"writer: started","stream_id":"18h777jw"} +{"time":"2025-12-09T10:15:46.074044896Z","level":"INFO","msg":"sender: started","stream_id":"18h777jw"} diff --git a/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug.log b/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..772f0769535dec5a54ec9b63ca1eb6cbe4d96da7 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 
10:15:45,648 INFO MainThread:1710831 [wandb_setup.py:_flush():80] Configure stats pid to 1710831 +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug.log +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_101545-18h777jw/logs/debug-internal.log +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_init.py:init():841] calling init triggers +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 10:15:45,648 INFO MainThread:1710831 [wandb_init.py:init():889] starting backend +2025-12-09 10:15:45,900 INFO MainThread:1710831 [wandb_init.py:init():892] sending inform_init request +2025-12-09 10:15:45,904 INFO MainThread:1710831 [wandb_init.py:init():900] backend started and connected +2025-12-09 10:15:45,906 INFO MainThread:1710831 [wandb_init.py:init():970] updated telemetry +2025-12-09 10:15:45,910 INFO MainThread:1710831 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 10:15:46,338 INFO MainThread:1710831 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 10:15:46,464 INFO MainThread:1710831 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 10:15:46,464 INFO MainThread:1710831 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 10:15:46,464 INFO MainThread:1710831 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 10:15:46,464 INFO MainThread:1710831 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-09 10:15:46,467 INFO MainThread:1710831 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 10:15:46,468 INFO MainThread:1710831 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': '/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251209_101545-18h777jw/run-18h777jw.wandb b/Meissonic/wandb/run-20251209_101545-18h777jw/run-18h777jw.wandb new file mode 100644 index 0000000000000000000000000000000000000000..186c17f506ee84f0803218a8aa7165f1cb0073c5 Binary files /dev/null and b/Meissonic/wandb/run-20251209_101545-18h777jw/run-18h777jw.wandb differ diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/config.yaml b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26aa17fc964b5c177c7bddec762f1468d1f4a278 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/config.yaml @@ -0,0 +1,294 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 0oe17z8ryn1a3t71a4cnr7g1rc6j1axk: + args: + - --text_encoder_architecture + - umt5-xxl + - --pretrained_model_name_or_path + - /mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - 
"2" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12040636317696" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T10:19:40.689127Z" + writerId: 0oe17z8ryn1a3t71a4cnr7g1rc6j1axk + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: null +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: 
/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue +pretrained_model_name_or_path: + value: /mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 2 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: null diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/output.log b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..31e265b1898d77fd80d12d03cb6197f1c16a43d7 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/output.log @@ -0,0 +1,51 @@ +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 69.33it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 10:19:44 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6137.15it/s] +12/09/2025 10:19:45 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=32, W'=56 +12/09/2025 10:19:45 - INFO - __main__ - Theoretical dimensions: F'=1, H'=32, W'=56 +12/09/2025 10:20:13 - INFO - __main__ - Parameter counts: backbone=2,191,878,208, other=2,112,033, total=2,193,990,241 +12/09/2025 10:20:13 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 10:20:13 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 10:20:13 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 10:20:20 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 10:20:20 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 10:20:20 - INFO - __main__ - Dataloader configuration: +12/09/2025 10:20:20 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 10:20:20 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 10:20:20 - INFO - __main__ - - persistent_workers: True +12/09/2025 10:20:20 - INFO - __main__ - - pin_memory: True +12/09/2025 10:20:20 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 10:20:41 - INFO - __main__ - ***** Running training ***** +12/09/2025 10:20:41 - INFO - __main__ - Num training steps = 10000 +12/09/2025 10:20:41 - INFO - __main__ - Instantaneous batch size per device = 2 +12/09/2025 10:20:41 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 64 +12/09/2025 10:20:41 - INFO - __main__ - Gradient Accumulation steps = 4 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1485, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1333, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.71 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.52 GiB is free. Process 1765386 has 414.00 MiB memory in use. Process 1765384 has 414.00 MiB memory in use. Process 1765387 has 414.00 MiB memory in use. Process 1765390 has 414.00 MiB memory in use. Process 1765389 has 414.00 MiB memory in use. Process 1765388 has 414.00 MiB memory in use. Process 1765385 has 414.00 MiB memory in use. 
Including non-PyTorch memory, this process has 35.09 GiB memory in use. Of the allocated memory 28.22 GiB is allocated by PyTorch, and 5.77 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1485, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1333, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.71 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.52 GiB is free. Process 1765386 has 414.00 MiB memory in use. Process 1765384 has 414.00 MiB memory in use. Process 1765387 has 414.00 MiB memory in use. Process 1765390 has 414.00 MiB memory in use. Process 1765389 has 414.00 MiB memory in use. Process 1765388 has 414.00 MiB memory in use. Process 1765385 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 35.09 GiB memory in use. Of the allocated memory 28.22 GiB is allocated by PyTorch, and 5.77 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/requirements.txt b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/wandb-metadata.json 
b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2edfad17a102c08ca6d1383807c4b36e62c7e600 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T10:19:40.689127Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--pretrained_model_name_or_path", + "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12040636317696" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": 
"GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "0oe17z8ryn1a3t71a4cnr7g1rc6j1axk" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/wandb-summary.json b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..43fa43b2931fdff41b1f87a0ba7176f1f1bdf2eb --- /dev/null +++ b/Meissonic/wandb/run-20251209_101940-6eo06e2t/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":113},"_runtime":113} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug-core.log b/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..5fdefcaa4d324afc11a0a0b55a85e389252babc8 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T10:19:40.761887998Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpsrhk7a3_/port-1765383.txt","pid":1765383,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T10:19:40.762361725Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1765383} +{"time":"2025-12-09T10:19:40.762355039Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1765383-1765647-3454298689/socket","Net":"unix"}} +{"time":"2025-12-09T10:19:40.946446197Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T10:19:40.952049688Z","level":"INFO","msg":"handleInformInit: received","streamId":"6eo06e2t","id":"1(@)"} +{"time":"2025-12-09T10:19:41.121266057Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"6eo06e2t","id":"1(@)"} +{"time":"2025-12-09T10:21:34.921702441Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T10:21:34.921773597Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T10:21:34.92176219Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T10:21:34.921874142Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1765383-1765647-3454298689/socket","Net":"unix"}} +{"time":"2025-12-09T10:21:34.922140875Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T10:21:35.28891892Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T10:21:35.28895153Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T10:21:35.288969226Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug-internal.log b/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..06bafdde6f66949c297c1927382867d292922437 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T10:19:40.952144615Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} 
+{"time":"2025-12-09T10:19:41.121052434Z","level":"INFO","msg":"stream: created new stream","id":"6eo06e2t"} +{"time":"2025-12-09T10:19:41.121134879Z","level":"INFO","msg":"handler: started","stream_id":"6eo06e2t"} +{"time":"2025-12-09T10:19:41.121257683Z","level":"INFO","msg":"stream: started","id":"6eo06e2t"} +{"time":"2025-12-09T10:19:41.121282803Z","level":"INFO","msg":"writer: started","stream_id":"6eo06e2t"} +{"time":"2025-12-09T10:19:41.121283706Z","level":"INFO","msg":"sender: started","stream_id":"6eo06e2t"} +{"time":"2025-12-09T10:21:34.921769871Z","level":"INFO","msg":"stream: closing","id":"6eo06e2t"} +{"time":"2025-12-09T10:21:35.156164304Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T10:21:35.285709569Z","level":"INFO","msg":"handler: closed","stream_id":"6eo06e2t"} +{"time":"2025-12-09T10:21:35.28582027Z","level":"INFO","msg":"sender: closed","stream_id":"6eo06e2t"} +{"time":"2025-12-09T10:21:35.285838151Z","level":"INFO","msg":"stream: closed","id":"6eo06e2t"} diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug.log b/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..f6f53434a61a60ca6f77b5d78b834a107a0bbda5 --- /dev/null +++ b/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 10:19:40,691 INFO MainThread:1765383 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 10:19:40,691 INFO MainThread:1765383 [wandb_setup.py:_flush():80] Configure stats pid to 1765383 +2025-12-09 10:19:40,691 INFO MainThread:1765383 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 10:19:40,691 INFO MainThread:1765383 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 10:19:40,691 INFO MainThread:1765383 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 10:19:40,692 INFO MainThread:1765383 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug.log +2025-12-09 10:19:40,692 INFO MainThread:1765383 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_101940-6eo06e2t/logs/debug-internal.log +2025-12-09 10:19:40,692 INFO MainThread:1765383 [wandb_init.py:init():841] calling init triggers +2025-12-09 10:19:40,692 INFO MainThread:1765383 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 10:19:40,692 INFO MainThread:1765383 [wandb_init.py:init():889] starting backend +2025-12-09 10:19:40,946 INFO MainThread:1765383 [wandb_init.py:init():892] sending inform_init request +2025-12-09 10:19:40,950 INFO MainThread:1765383 [wandb_init.py:init():900] backend started and connected +2025-12-09 10:19:40,951 INFO MainThread:1765383 [wandb_init.py:init():970] updated telemetry +2025-12-09 10:19:40,956 INFO MainThread:1765383 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 10:19:41,378 INFO MainThread:1765383 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 10:19:41,503 INFO MainThread:1765383 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 10:19:41,503 INFO MainThread:1765383 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 10:19:41,503 INFO MainThread:1765383 [wandb_run.py:_redirect():2438] Wrapping output streams. 
+2025-12-09 10:19:41,503 INFO MainThread:1765383 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 10:19:41,506 INFO MainThread:1765383 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 10:19:41,507 INFO MainThread:1765383 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': '/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': None, 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': False, 'features_dir': None} +2025-12-09 10:21:34,921 INFO wandb-AsyncioManager-main:1765383 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 10:21:34,922 INFO wandb-AsyncioManager-main:1765383 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251209_101940-6eo06e2t/run-6eo06e2t.wandb b/Meissonic/wandb/run-20251209_101940-6eo06e2t/run-6eo06e2t.wandb new file mode 100644 index 0000000000000000000000000000000000000000..3cfe92bd40c2585a42e441f8c31e8ce2d293c1e2 Binary files /dev/null and b/Meissonic/wandb/run-20251209_101940-6eo06e2t/run-6eo06e2t.wandb differ diff --git a/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/output.log b/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9da8de2d4a396986ebb07ff4856956f84b4144a8 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/output.log @@ -0,0 +1,25 @@ +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 71.63it/s] +You are using the default legacy behaviour of the . 
This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 10:22:38 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7013.89it/s] +12/09/2025 10:22:40 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=32, W'=56 +12/09/2025 10:22:40 - INFO - __main__ - Theoretical dimensions: F'=1, H'=32, W'=56 +12/09/2025 10:23:07 - INFO - __main__ - Parameter counts: backbone=2,191,878,208, other=2,112,033, total=2,193,990,241 +12/09/2025 10:23:07 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 10:23:07 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 10:23:07 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 10:23:14 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 10:23:14 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 10:23:14 - INFO - __main__ - Dataloader configuration: +12/09/2025 10:23:14 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 10:23:14 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 10:23:14 - INFO - __main__ - - persistent_workers: True +12/09/2025 10:23:14 - INFO - __main__ - - pin_memory: True +12/09/2025 10:23:14 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 10:23:35 - INFO - __main__ - ***** Running training ***** +12/09/2025 10:23:35 - INFO - __main__ - Num training steps = 10000 +12/09/2025 10:23:35 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 10:23:35 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/09/2025 10:23:35 - INFO - __main__ - Gradient Accumulation steps = 8 +12/09/2025 10:24:53 - INFO - __main__ - Step: 10 Loss: 11.0763 LR: 0.000300 +12/09/2025 10:26:06 - INFO - __main__ - Step: 20 Loss: 11.0723 LR: 0.000300 diff --git a/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/requirements.txt b/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 
+nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f052f5f95fcd556d74968520f21cf5d88fb7b3d0 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102235-95y0vlh3/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T10:22:35.233134Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--pretrained_model_name_or_path", + "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "8", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12040636452864" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" 
+ }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "yzvcufsb52wtryxz5lwk657v5el3944m" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug-core.log b/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..f95d409c2b51c30023dd2e7355572fbcf6a02336 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-09T10:22:35.303223801Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpx7g2yu4c/port-1775616.txt","pid":1775616,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T10:22:35.303750614Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1775616} +{"time":"2025-12-09T10:22:35.303730014Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1775616-1775889-726218309/socket","Net":"unix"}} +{"time":"2025-12-09T10:22:35.487936562Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T10:22:35.494099777Z","level":"INFO","msg":"handleInformInit: received","streamId":"95y0vlh3","id":"1(@)"} +{"time":"2025-12-09T10:22:35.66364009Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"95y0vlh3","id":"1(@)"} +{"time":"2025-12-09T10:26:32.909805593Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug-internal.log b/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..59ed6746f079f29f00dd0347a6a90d640cf5e059 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-09T10:22:35.494245676Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T10:22:35.663435247Z","level":"INFO","msg":"stream: created new stream","id":"95y0vlh3"} +{"time":"2025-12-09T10:22:35.663518064Z","level":"INFO","msg":"handler: started","stream_id":"95y0vlh3"} +{"time":"2025-12-09T10:22:35.663633531Z","level":"INFO","msg":"stream: started","id":"95y0vlh3"} +{"time":"2025-12-09T10:22:35.663654716Z","level":"INFO","msg":"writer: started","stream_id":"95y0vlh3"} +{"time":"2025-12-09T10:22:35.663655985Z","level":"INFO","msg":"sender: started","stream_id":"95y0vlh3"} diff --git a/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug.log b/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0ff20246283d0b92a338f23a01d862e89369c6fe --- /dev/null +++ b/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-09 10:22:35,235 INFO MainThread:1775616 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 10:22:35,235 INFO MainThread:1775616 [wandb_setup.py:_flush():80] Configure stats pid to 1775616 +2025-12-09 10:22:35,235 INFO MainThread:1775616 
[wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 10:22:35,235 INFO MainThread:1775616 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 10:22:35,235 INFO MainThread:1775616 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 10:22:35,236 INFO MainThread:1775616 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug.log +2025-12-09 10:22:35,236 INFO MainThread:1775616 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_102235-95y0vlh3/logs/debug-internal.log +2025-12-09 10:22:35,236 INFO MainThread:1775616 [wandb_init.py:init():841] calling init triggers +2025-12-09 10:22:35,236 INFO MainThread:1775616 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 10:22:35,236 INFO MainThread:1775616 [wandb_init.py:init():889] starting backend +2025-12-09 10:22:35,488 INFO MainThread:1775616 [wandb_init.py:init():892] sending inform_init request +2025-12-09 10:22:35,492 INFO MainThread:1775616 [wandb_init.py:init():900] backend started and connected +2025-12-09 10:22:35,493 INFO MainThread:1775616 [wandb_init.py:init():970] updated telemetry +2025-12-09 10:22:35,498 INFO MainThread:1775616 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 10:22:35,983 INFO MainThread:1775616 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 10:22:36,107 INFO MainThread:1775616 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 10:22:36,107 INFO MainThread:1775616 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 10:22:36,107 INFO MainThread:1775616 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 10:22:36,107 INFO MainThread:1775616 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-09 10:22:36,109 INFO MainThread:1775616 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 10:22:36,111 INFO MainThread:1775616 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': '/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': None, 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251209_102235-95y0vlh3/run-95y0vlh3.wandb b/Meissonic/wandb/run-20251209_102235-95y0vlh3/run-95y0vlh3.wandb new file mode 100644 index 0000000000000000000000000000000000000000..c1f02f0abb2ea15448b6ab628ab3b8d1cf68963b Binary files /dev/null and b/Meissonic/wandb/run-20251209_102235-95y0vlh3/run-95y0vlh3.wandb differ diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_2a764e89458c3c8d15fb.png b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_2a764e89458c3c8d15fb.png new file mode 100644 index 0000000000000000000000000000000000000000..67fd9ce0e135bc0a8d01b659eee676a6b43c8e9e --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_2a764e89458c3c8d15fb.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a764e89458c3c8d15fb1b993214628df564089d2d2d0b6e6f304b5042a5b4c2 +size 153669 diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_80cf7f467b6a4ea9a5d4.png b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_80cf7f467b6a4ea9a5d4.png new file mode 100644 index 
0000000000000000000000000000000000000000..c9280a89b8e3b30354f265404ad27523c27e05e9 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_80cf7f467b6a4ea9a5d4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80cf7f467b6a4ea9a5d477c4008984777d1b6b4422f00b945cdb847edeadabaa +size 143871 diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1500_c5788a4747c03f8f9f49.png b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1500_c5788a4747c03f8f9f49.png new file mode 100644 index 0000000000000000000000000000000000000000..deed70824d03d140b7743b739ff085d9ed8a85c6 Binary files /dev/null and b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1500_c5788a4747c03f8f9f49.png differ diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1500_fb32391d5c492e093a1a.png b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1500_fb32391d5c492e093a1a.png new file mode 100644 index 0000000000000000000000000000000000000000..aa9f132b4e3203ba9739eb46384a8de251dd3b29 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1500_fb32391d5c492e093a1a.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb32391d5c492e093a1a5181dfed7a57173d650f41a8937045043a6654a4584d +size 144814 diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_9a388a1a15b60d9f4438.png b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_9a388a1a15b60d9f4438.png new file mode 100644 index 0000000000000000000000000000000000000000..996ffc4309be4eb10301a052e03e5dd8f070cf42 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_9a388a1a15b60d9f4438.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a388a1a15b60d9f44386c6a939bac94001713fd8b9c2583f78b4338bab330ea +size 120118 diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_c2c619bff47ae122a524.png b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_c2c619bff47ae122a524.png new file mode 100644 index 0000000000000000000000000000000000000000..6491dea77af7d8a9416908347e2761164abe4181 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_c2c619bff47ae122a524.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2c619bff47ae122a5247b1528f8a40c85a112141a65dadbc62886b3a9abcb83 +size 127496 diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/output.log b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8b65910600fefd2d8cc88466ac214a4411944776 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/output.log @@ -0,0 +1,234 @@ +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 70.39it/s] +You are using the default legacy behaviour of the . 
This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 10:26:54 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 12531.00it/s] +12/09/2025 10:27:00 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 10:27:00 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 10:27:00 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 10:27:00 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 10:27:08 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 10:27:08 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 10:27:08 - INFO - __main__ - Dataloader configuration: +12/09/2025 10:27:08 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 10:27:08 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 10:27:08 - INFO - __main__ - - persistent_workers: True +12/09/2025 10:27:08 - INFO - __main__ - - pin_memory: True +12/09/2025 10:27:08 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 10:27:26 - INFO - __main__ - ***** Running training ***** +12/09/2025 10:27:26 - INFO - __main__ - Num training steps = 10000 +12/09/2025 10:27:26 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 10:27:26 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/09/2025 10:27:26 - INFO - __main__ - Gradient Accumulation steps = 8 +12/09/2025 10:28:31 - INFO - __main__ - Step: 10 Loss: 10.5756 LR: 0.000300 +12/09/2025 10:29:29 - INFO - __main__ - Step: 20 Loss: 10.5985 LR: 0.000300 +12/09/2025 10:30:30 - INFO - __main__ - Step: 30 Loss: 10.5381 LR: 0.000300 +12/09/2025 10:31:31 - INFO - __main__ - Step: 40 Loss: 10.5820 LR: 0.000300 +12/09/2025 10:32:30 - INFO - __main__ - Step: 50 Loss: 10.5307 LR: 0.000300 +12/09/2025 10:33:29 - INFO - __main__ - Step: 60 Loss: 10.5077 LR: 0.000300 +12/09/2025 10:34:30 - INFO - __main__ - Step: 70 Loss: 10.5897 LR: 0.000300 +12/09/2025 10:35:28 - INFO - __main__ - Step: 80 Loss: 10.5997 LR: 0.000300 +12/09/2025 10:36:25 - INFO - __main__ - Step: 90 Loss: 10.4886 LR: 0.000300 +12/09/2025 10:37:23 - INFO - __main__ - Step: 100 Loss: 10.5784 LR: 0.000300 +12/09/2025 10:38:22 - INFO - __main__ - Step: 110 Loss: 10.4489 LR: 0.000300 +12/09/2025 10:39:19 - INFO - __main__ - Step: 120 Loss: 10.4976 LR: 0.000300 +12/09/2025 10:40:18 - INFO - __main__ - Step: 130 Loss: 10.5232 LR: 0.000300 +12/09/2025 10:41:19 - INFO - __main__ - Step: 140 Loss: 10.5564 LR: 0.000300 +12/09/2025 10:42:22 - INFO - __main__ - Step: 150 Loss: 10.5033 LR: 0.000300 +12/09/2025 10:43:24 - INFO - __main__ - Step: 160 Loss: 10.6194 LR: 0.000300 +12/09/2025 10:44:26 - INFO - __main__ - Step: 170 Loss: 10.5627 LR: 0.000300 +12/09/2025 10:45:28 - INFO - __main__ - Step: 180 Loss: 10.5360 LR: 0.000300 +12/09/2025 10:46:31 - INFO - __main__ - Step: 190 Loss: 10.5164 LR: 0.000300 +12/09/2025 10:47:33 - INFO - __main__ - Step: 200 Loss: 10.4800 LR: 0.000300 +12/09/2025 10:48:35 - INFO - __main__ - Step: 210 Loss: 10.5856 LR: 0.000300 +12/09/2025 10:49:37 - INFO - __main__ - Step: 220 Loss: 10.5771 LR: 0.000300 +12/09/2025 10:50:38 - INFO - __main__ - Step: 230 Loss: 10.5694 LR: 0.000300 +12/09/2025 10:51:39 - INFO - __main__ - Step: 240 Loss: 10.5486 LR: 0.000300 +12/09/2025 10:52:39 - INFO - __main__ - Step: 250 Loss: 10.5517 LR: 0.000300 +12/09/2025 10:53:41 - INFO - __main__ - Step: 260 Loss: 10.4185 LR: 0.000300 +12/09/2025 10:54:45 - INFO - __main__ - Step: 270 Loss: 10.4816 LR: 0.000300 +12/09/2025 10:55:45 - INFO - __main__ - Step: 280 Loss: 10.5435 LR: 0.000300 +12/09/2025 10:56:46 - INFO - __main__ - Step: 290 Loss: 10.5429 LR: 0.000300 +12/09/2025 10:57:49 - INFO - __main__ - Step: 300 Loss: 10.5335 LR: 0.000300 +12/09/2025 10:58:49 - INFO - __main__ - Step: 310 Loss: 10.5825 LR: 0.000300 +12/09/2025 10:59:51 - INFO - __main__ - Step: 320 Loss: 10.5503 LR: 0.000300 +12/09/2025 11:00:52 - INFO - __main__ - Step: 330 Loss: 10.5399 LR: 0.000300 +12/09/2025 11:01:52 - INFO - __main__ - Step: 340 Loss: 10.4736 LR: 0.000300 +12/09/2025 11:02:52 - INFO - __main__ - Step: 350 Loss: 10.5815 LR: 0.000300 +12/09/2025 11:03:57 - INFO - __main__ - Step: 360 Loss: 10.4822 LR: 0.000300 +12/09/2025 11:05:01 - INFO - __main__ - Step: 370 Loss: 10.5418 LR: 0.000300 +12/09/2025 11:06:02 - INFO - __main__ - Step: 380 Loss: 10.4685 LR: 0.000300 +12/09/2025 11:07:04 - INFO - __main__ - Step: 390 Loss: 10.5073 LR: 0.000300 +12/09/2025 11:08:06 - INFO - __main__ - Step: 400 Loss: 10.5048 LR: 0.000300 +12/09/2025 11:09:07 - INFO - __main__ - Step: 410 Loss: 10.4811 LR: 0.000300 +12/09/2025 11:10:09 - INFO - __main__ - Step: 420 Loss: 10.4874 LR: 0.000300 +12/09/2025 11:11:10 - INFO - __main__ - Step: 430 Loss: 10.5045 LR: 0.000300 +12/09/2025 11:12:12 - INFO - __main__ - Step: 440 Loss: 10.4710 LR: 0.000300 +12/09/2025 
11:13:13 - INFO - __main__ - Step: 450 Loss: 10.5233 LR: 0.000300 +12/09/2025 11:14:13 - INFO - __main__ - Step: 460 Loss: 10.4441 LR: 0.000300 +12/09/2025 11:15:13 - INFO - __main__ - Step: 470 Loss: 10.4439 LR: 0.000300 +12/09/2025 11:16:13 - INFO - __main__ - Step: 480 Loss: 10.5054 LR: 0.000300 +12/09/2025 11:17:12 - INFO - __main__ - Step: 490 Loss: 10.3729 LR: 0.000300 +12/09/2025 11:18:12 - INFO - __main__ - Step: 500 Loss: 10.5167 LR: 0.000300 +12/09/2025 11:18:12 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500 +12/09/2025 11:18:20 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/optimizer.bin +12/09/2025 11:18:20 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/scheduler.bin +12/09/2025 11:18:20 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/sampler.bin +12/09/2025 11:18:20 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500/random_states_0.pkl +12/09/2025 11:18:20 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-500 +12/09/2025 11:18:20 - INFO - __main__ - Generating videos for validation... +12/09/2025 11:18:20 - INFO - __main__ - Generating videos for validation... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.59it/s] +12/09/2025 11:18:27 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue +12/09/2025 11:19:28 - INFO - __main__ - Step: 510 Loss: 10.4568 LR: 0.000300 +12/09/2025 11:20:28 - INFO - __main__ - Step: 520 Loss: 10.4816 LR: 0.000300 +12/09/2025 11:21:29 - INFO - __main__ - Step: 530 Loss: 10.3487 LR: 0.000300 +12/09/2025 11:22:29 - INFO - __main__ - Step: 540 Loss: 10.5037 LR: 0.000300 +12/09/2025 11:23:30 - INFO - __main__ - Step: 550 Loss: 10.5109 LR: 0.000300 +12/09/2025 11:24:34 - INFO - __main__ - Step: 560 Loss: 10.4070 LR: 0.000300 +12/09/2025 11:25:36 - INFO - __main__ - Step: 570 Loss: 10.3687 LR: 0.000300 +12/09/2025 11:26:40 - INFO - __main__ - Step: 580 Loss: 10.4315 LR: 0.000300 +12/09/2025 11:27:41 - INFO - __main__ - Step: 590 Loss: 10.5524 LR: 0.000300 +12/09/2025 11:28:43 - INFO - __main__ - Step: 600 Loss: 10.4376 LR: 0.000300 +12/09/2025 11:29:45 - INFO - __main__ - Step: 610 Loss: 10.4982 LR: 0.000300 +12/09/2025 11:30:44 - INFO - __main__ - Step: 620 Loss: 10.5766 LR: 0.000300 +12/09/2025 11:31:46 - INFO - __main__ - Step: 630 Loss: 10.4860 LR: 0.000300 +12/09/2025 11:32:48 - INFO - __main__ - Step: 640 Loss: 10.3650 LR: 0.000300 +12/09/2025 11:33:50 - INFO - __main__ - Step: 650 Loss: 10.3143 LR: 0.000300 +12/09/2025 11:34:51 - INFO - __main__ - Step: 660 Loss: 10.4826 LR: 0.000300 +12/09/2025 11:35:52 - INFO - __main__ - Step: 670 Loss: 10.5079 LR: 0.000300 +12/09/2025 11:36:54 - INFO - __main__ - Step: 680 Loss: 10.4871 LR: 0.000300 +12/09/2025 11:37:55 - INFO - __main__ - Step: 690 Loss: 10.3807 LR: 0.000300 +12/09/2025 11:38:56 - INFO - __main__ - Step: 700 Loss: 10.5205 LR: 0.000300 +12/09/2025 11:39:59 - INFO - __main__ - Step: 710 Loss: 10.5428 LR: 0.000300 +12/09/2025 11:40:59 - INFO - __main__ - Step: 
720 Loss: 10.3462 LR: 0.000300 +12/09/2025 11:41:57 - INFO - __main__ - Step: 730 Loss: 10.4895 LR: 0.000300 +12/09/2025 11:42:59 - INFO - __main__ - Step: 740 Loss: 10.4331 LR: 0.000300 +12/09/2025 11:44:02 - INFO - __main__ - Step: 750 Loss: 10.4456 LR: 0.000300 +12/09/2025 11:45:04 - INFO - __main__ - Step: 760 Loss: 10.4432 LR: 0.000300 +12/09/2025 11:46:05 - INFO - __main__ - Step: 770 Loss: 10.4319 LR: 0.000300 +12/09/2025 11:47:05 - INFO - __main__ - Step: 780 Loss: 10.3427 LR: 0.000300 +12/09/2025 11:48:05 - INFO - __main__ - Step: 790 Loss: 10.5008 LR: 0.000300 +12/09/2025 11:49:04 - INFO - __main__ - Step: 800 Loss: 10.4130 LR: 0.000300 +12/09/2025 11:50:04 - INFO - __main__ - Step: 810 Loss: 10.3861 LR: 0.000300 +12/09/2025 11:51:06 - INFO - __main__ - Step: 820 Loss: 10.4996 LR: 0.000300 +12/09/2025 11:52:07 - INFO - __main__ - Step: 830 Loss: 10.4474 LR: 0.000300 +12/09/2025 11:53:09 - INFO - __main__ - Step: 840 Loss: 10.2460 LR: 0.000300 +12/09/2025 11:54:08 - INFO - __main__ - Step: 850 Loss: 10.4203 LR: 0.000300 +12/09/2025 11:55:09 - INFO - __main__ - Step: 860 Loss: 10.3336 LR: 0.000300 +12/09/2025 11:56:11 - INFO - __main__ - Step: 870 Loss: 10.4718 LR: 0.000300 +12/09/2025 11:57:12 - INFO - __main__ - Step: 880 Loss: 10.3754 LR: 0.000300 +12/09/2025 11:58:15 - INFO - __main__ - Step: 890 Loss: 10.4541 LR: 0.000300 +12/09/2025 11:59:16 - INFO - __main__ - Step: 900 Loss: 10.4114 LR: 0.000300 +12/09/2025 12:00:15 - INFO - __main__ - Step: 910 Loss: 10.4875 LR: 0.000300 +12/09/2025 12:01:17 - INFO - __main__ - Step: 920 Loss: 10.4626 LR: 0.000300 +12/09/2025 12:02:19 - INFO - __main__ - Step: 930 Loss: 10.4343 LR: 0.000300 +12/09/2025 12:03:19 - INFO - __main__ - Step: 940 Loss: 10.4718 LR: 0.000300 +12/09/2025 12:04:20 - INFO - __main__ - Step: 950 Loss: 10.4602 LR: 0.000300 +12/09/2025 12:05:20 - INFO - __main__ - Step: 960 Loss: 10.3744 LR: 0.000300 +12/09/2025 12:06:23 - INFO - __main__ - Step: 970 Loss: 10.3601 LR: 0.000300 +12/09/2025 12:07:23 - INFO - __main__ - Step: 980 Loss: 10.4573 LR: 0.000300 +12/09/2025 12:08:24 - INFO - __main__ - Step: 990 Loss: 10.5020 LR: 0.000300 +12/09/2025 12:09:28 - INFO - __main__ - Step: 1000 Loss: 10.3293 LR: 0.000300 +12/09/2025 12:09:28 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000 +12/09/2025 12:09:37 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/optimizer.bin +12/09/2025 12:09:37 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/scheduler.bin +12/09/2025 12:09:37 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/sampler.bin +12/09/2025 12:09:37 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000/random_states_0.pkl +12/09/2025 12:09:37 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1000 +12/09/2025 12:09:37 - INFO - __main__ - Generating videos for validation... +12/09/2025 12:09:37 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.80it/s] +12/09/2025 12:09:45 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue +12/09/2025 12:10:47 - INFO - __main__ - Step: 1010 Loss: 10.2537 LR: 0.000300 +12/09/2025 12:11:48 - INFO - __main__ - Step: 1020 Loss: 10.3517 LR: 0.000300 +12/09/2025 12:12:46 - INFO - __main__ - Step: 1030 Loss: 10.3686 LR: 0.000300 +12/09/2025 12:13:48 - INFO - __main__ - Step: 1040 Loss: 10.3712 LR: 0.000300 +12/09/2025 12:14:48 - INFO - __main__ - Step: 1050 Loss: 10.4094 LR: 0.000300 +12/09/2025 12:15:48 - INFO - __main__ - Step: 1060 Loss: 10.4581 LR: 0.000300 +12/09/2025 12:16:49 - INFO - __main__ - Step: 1070 Loss: 10.4886 LR: 0.000300 +12/09/2025 12:17:49 - INFO - __main__ - Step: 1080 Loss: 10.3462 LR: 0.000300 +12/09/2025 12:18:50 - INFO - __main__ - Step: 1090 Loss: 10.4209 LR: 0.000300 +12/09/2025 12:19:49 - INFO - __main__ - Step: 1100 Loss: 10.4281 LR: 0.000300 +12/09/2025 12:20:49 - INFO - __main__ - Step: 1110 Loss: 10.4686 LR: 0.000300 +12/09/2025 12:21:51 - INFO - __main__ - Step: 1120 Loss: 10.4089 LR: 0.000300 +12/09/2025 12:22:52 - INFO - __main__ - Step: 1130 Loss: 10.3998 LR: 0.000300 +12/09/2025 12:23:52 - INFO - __main__ - Step: 1140 Loss: 10.3321 LR: 0.000300 +12/09/2025 12:24:54 - INFO - __main__ - Step: 1150 Loss: 10.2142 LR: 0.000300 +12/09/2025 12:25:53 - INFO - __main__ - Step: 1160 Loss: 10.4853 LR: 0.000300 +12/09/2025 12:26:50 - INFO - __main__ - Step: 1170 Loss: 10.5006 LR: 0.000300 +12/09/2025 12:27:49 - INFO - __main__ - Step: 1180 Loss: 10.4385 LR: 0.000300 +12/09/2025 12:28:48 - INFO - __main__ - Step: 1190 Loss: 10.1816 LR: 0.000300 +12/09/2025 12:29:48 - INFO - __main__ - Step: 1200 Loss: 10.3925 LR: 0.000300 +12/09/2025 12:30:48 - INFO - __main__ - Step: 1210 Loss: 10.2949 LR: 0.000300 +12/09/2025 12:31:46 - INFO - __main__ - Step: 1220 Loss: 10.3461 LR: 0.000300 +12/09/2025 12:32:45 - INFO - __main__ - Step: 1230 Loss: 10.4405 LR: 0.000300 +12/09/2025 12:33:44 - INFO - __main__ - Step: 1240 Loss: 10.3677 LR: 0.000300 +12/09/2025 12:34:42 - INFO - __main__ - Step: 1250 Loss: 10.4093 LR: 0.000300 +12/09/2025 12:35:42 - INFO - __main__ - Step: 1260 Loss: 10.4217 LR: 0.000300 +12/09/2025 12:36:42 - INFO - __main__ - Step: 1270 Loss: 10.4606 LR: 0.000300 +12/09/2025 12:37:42 - INFO - __main__ - Step: 1280 Loss: 10.4633 LR: 0.000300 +12/09/2025 12:38:40 - INFO - __main__ - Step: 1290 Loss: 10.3848 LR: 0.000300 +12/09/2025 12:39:38 - INFO - __main__ - Step: 1300 Loss: 10.4616 LR: 0.000300 +12/09/2025 12:40:35 - INFO - __main__ - Step: 1310 Loss: 10.4222 LR: 0.000300 +12/09/2025 12:41:36 - INFO - __main__ - Step: 1320 Loss: 10.4237 LR: 0.000300 +12/09/2025 12:42:36 - INFO - __main__ - Step: 1330 Loss: 10.4675 LR: 0.000300 +12/09/2025 12:43:38 - INFO - __main__ - Step: 1340 Loss: 10.4428 LR: 0.000300 +12/09/2025 12:44:37 - INFO - __main__ - Step: 1350 Loss: 10.3288 LR: 0.000300 +12/09/2025 12:45:37 - INFO - __main__ - Step: 1360 Loss: 10.4652 LR: 0.000300 +12/09/2025 12:46:37 - INFO - __main__ - Step: 1370 Loss: 10.2860 LR: 0.000300 +12/09/2025 12:47:38 - INFO - __main__ - Step: 1380 Loss: 10.4438 LR: 0.000300 +12/09/2025 12:48:38 - INFO - __main__ - Step: 1390 Loss: 10.2577 LR: 0.000300 +12/09/2025 12:49:36 - INFO - __main__ - Step: 1400 Loss: 10.4907 LR: 0.000300 +12/09/2025 12:50:33 - INFO - __main__ - Step: 1410 Loss: 10.4241 LR: 0.000300 +12/09/2025 
12:51:32 - INFO - __main__ - Step: 1420 Loss: 10.4317 LR: 0.000300 +12/09/2025 12:52:31 - INFO - __main__ - Step: 1430 Loss: 10.4626 LR: 0.000300 +12/09/2025 12:53:29 - INFO - __main__ - Step: 1440 Loss: 9.9582 LR: 0.000300 +12/09/2025 12:54:25 - INFO - __main__ - Step: 1450 Loss: 10.4416 LR: 0.000300 +12/09/2025 12:55:20 - INFO - __main__ - Step: 1460 Loss: 10.4264 LR: 0.000300 +12/09/2025 12:56:15 - INFO - __main__ - Step: 1470 Loss: 10.4191 LR: 0.000300 +12/09/2025 12:57:12 - INFO - __main__ - Step: 1480 Loss: 10.4106 LR: 0.000300 +12/09/2025 12:58:08 - INFO - __main__ - Step: 1490 Loss: 10.4202 LR: 0.000300 +12/09/2025 12:59:05 - INFO - __main__ - Step: 1500 Loss: 10.3524 LR: 0.000300 +12/09/2025 12:59:05 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500 +12/09/2025 12:59:14 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/optimizer.bin +12/09/2025 12:59:14 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/scheduler.bin +12/09/2025 12:59:14 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/sampler.bin +12/09/2025 12:59:14 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500/random_states_0.pkl +12/09/2025 12:59:14 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue/checkpoint-1500 +12/09/2025 12:59:14 - INFO - __main__ - Generating videos for validation... +12/09/2025 12:59:14 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.85it/s] +12/09/2025 12:59:21 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue +12/09/2025 13:00:21 - INFO - __main__ - Step: 1510 Loss: 10.4142 LR: 0.000300 +12/09/2025 13:01:20 - INFO - __main__ - Step: 1520 Loss: 10.3434 LR: 0.000300 +12/09/2025 13:02:21 - INFO - __main__ - Step: 1530 Loss: 10.3446 LR: 0.000300 +12/09/2025 13:03:20 - INFO - __main__ - Step: 1540 Loss: 10.2433 LR: 0.000300 +12/09/2025 13:04:20 - INFO - __main__ - Step: 1550 Loss: 10.4155 LR: 0.000300 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1485, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1333, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1485, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1333, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/requirements.txt b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 
+setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..34608b872588695b6b3f38c7194f68f73d071956 --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/files/wandb-metadata.json @@ -0,0 +1,149 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T10:26:51.247187Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--pretrained_model_name_or_path", + "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + 
"--gradient_accumulation_steps", + "8", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12040636624896" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "hsmu5rjpjbgh0nwq7xjadpomdbmu3xn2" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug-core.log b/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..89b2bd21bb34cfc83b9539dd7386fb9855fe0e6e --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-09T10:26:51.315145192Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpmv5pwgec/port-1814887.txt","pid":1814887,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} 
+{"time":"2025-12-09T10:26:51.315676461Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1814887} +{"time":"2025-12-09T10:26:51.315686707Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1814887-1815159-3368749728/socket","Net":"unix"}} +{"time":"2025-12-09T10:26:51.501584188Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T10:26:51.507573641Z","level":"INFO","msg":"handleInformInit: received","streamId":"55o5soqg","id":"1(@)"} +{"time":"2025-12-09T10:26:51.697675637Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"55o5soqg","id":"1(@)"} +{"time":"2025-12-09T13:05:17.894692268Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug-internal.log b/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ae66f536156494f6b2ddf0b1cdd0561c04fb4ccc --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-09T10:26:51.507674532Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T10:26:51.697475367Z","level":"INFO","msg":"stream: created new stream","id":"55o5soqg"} +{"time":"2025-12-09T10:26:51.697546226Z","level":"INFO","msg":"handler: started","stream_id":"55o5soqg"} +{"time":"2025-12-09T10:26:51.697669497Z","level":"INFO","msg":"stream: started","id":"55o5soqg"} +{"time":"2025-12-09T10:26:51.697687428Z","level":"INFO","msg":"sender: started","stream_id":"55o5soqg"} +{"time":"2025-12-09T10:26:51.697686518Z","level":"INFO","msg":"writer: started","stream_id":"55o5soqg"} diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug.log b/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..c528c5b2fa4eadfcdcf0a783845ba08938d1bf8f --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_setup.py:_flush():80] Configure stats pid to 1814887 +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug.log +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_102651-55o5soqg/logs/debug-internal.log +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_init.py:init():841] calling init triggers +2025-12-09 10:26:51,249 INFO MainThread:1814887 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 10:26:51,250 INFO MainThread:1814887 [wandb_init.py:init():889] starting backend +2025-12-09 10:26:51,501 INFO MainThread:1814887 
[wandb_init.py:init():892] sending inform_init request +2025-12-09 10:26:51,506 INFO MainThread:1814887 [wandb_init.py:init():900] backend started and connected +2025-12-09 10:26:51,507 INFO MainThread:1814887 [wandb_init.py:init():970] updated telemetry +2025-12-09 10:26:51,511 INFO MainThread:1814887 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 10:26:52,078 INFO MainThread:1814887 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 10:26:52,203 INFO MainThread:1814887 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 10:26:52,203 INFO MainThread:1814887 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 10:26:52,203 INFO MainThread:1814887 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 10:26:52,203 INFO MainThread:1814887 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 10:26:52,206 INFO MainThread:1814887 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 10:26:52,207 INFO MainThread:1814887 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': False, 'pretrained_model_name_or_path': '/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': None, 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251209_102651-55o5soqg/run-55o5soqg.wandb b/Meissonic/wandb/run-20251209_102651-55o5soqg/run-55o5soqg.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a5f09ea7316d40c41ea7b4ab9e7a6454e9cb2fde --- /dev/null +++ b/Meissonic/wandb/run-20251209_102651-55o5soqg/run-55o5soqg.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2425b0a0a1225df970143645fbb05621ecc43f561be094223de12c44897e1caf +size 2392064 diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/files/config.yaml b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3691389a26af5dbc4ce4c901951467b8924280fe --- /dev/null +++ b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/config.yaml @@ -0,0 +1,297 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + bldt1h2u4zo6vc79xvm2e3ynt6e3tebc: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --pretrained_model_name_or_path + - /mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "8" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12093761687552" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + 
total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T13:50:22.825258Z" + writerId: bldt1h2u4zo6vc79xvm2e3ynt6e3tebc + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 8 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +pretrained_model_name_or_path: + value: /mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: false +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: null diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/files/output.log b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a85066a6de9c97e299c0b71d51818b320dc24c12 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/output.log @@ -0,0 +1,54 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 68.30it/s] +You are using the default legacy behaviour of the . 
This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 13:50:26 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +12/09/2025 13:50:26 - INFO - __main__ - Using precomputed features - video tokenizer will only be used for mask_token_id +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5728.81it/s] +12/09/2025 13:50:32 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 13:50:32 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 13:50:32 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 13:50:32 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 13:50:32 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 13:50:32 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 13:50:32 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 13:50:32 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 13:50:32 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 13:50:32 - INFO - __main__ - Dataloader configuration: +12/09/2025 13:50:32 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 13:50:32 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 13:50:32 - INFO - __main__ - - persistent_workers: True +12/09/2025 13:50:32 - INFO - __main__ - - pin_memory: True +12/09/2025 13:50:32 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 13:50:48 - INFO - __main__ - ***** Running training ***** +12/09/2025 13:50:48 - INFO - __main__ - Num training steps = 10000 +12/09/2025 13:50:48 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 13:50:48 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 64 +12/09/2025 13:50:48 - INFO - __main__ - Gradient Accumulation steps = 8 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1485, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1322, in main + loss = F.cross_entropy( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy + return torch._C._nn.cross_entropy_loss( +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.58 GiB. GPU 0 has a total capacity of 39.49 GiB of which 4.18 GiB is free. Process 42760 has 414.00 MiB memory in use. Process 42757 has 414.00 MiB memory in use. Process 42758 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 32.43 GiB memory in use. Process 42759 has 414.00 MiB memory in use. Process 42754 has 414.00 MiB memory in use. Process 42755 has 414.00 MiB memory in use. Process 42756 has 414.00 MiB memory in use. Of the allocated memory 31.07 GiB is allocated by PyTorch, and 274.38 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1485, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1322, in main +[rank0]: loss = F.cross_entropy( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy +[rank0]: return torch._C._nn.cross_entropy_loss( +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.58 GiB. GPU 0 has a total capacity of 39.49 GiB of which 4.18 GiB is free. Process 42760 has 414.00 MiB memory in use. Process 42757 has 414.00 MiB memory in use. Process 42758 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 32.43 GiB memory in use. Process 42759 has 414.00 MiB memory in use. Process 42754 has 414.00 MiB memory in use. Process 42755 has 414.00 MiB memory in use. Process 42756 has 414.00 MiB memory in use. Of the allocated memory 31.07 GiB is allocated by PyTorch, and 274.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Exception ignored in atexit callback: +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1648, in _clean_up_worker + w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/process.py", line 149, in join + res = self._popen.wait(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait + if not wait([self.sentinel], timeout): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt: diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/files/requirements.txt b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 
+typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..77df7e2b6713a804b3c8382827a181a22bab0525 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/wandb-metadata.json @@ -0,0 +1,152 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T13:50:22.825258Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--pretrained_model_name_or_path", + "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "8", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + 
"--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12093761687552" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "bldt1h2u4zo6vc79xvm2e3ynt6e3tebc" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/files/wandb-summary.json b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..978ee889fedb890efdfac2b1684d5a836afcc2b0 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135022-dl453cfo/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":54},"_runtime":54} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug-core.log b/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..56473134125ed9b6925e646ca947f55302e249ad --- /dev/null +++ 
b/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug-core.log @@ -0,0 +1,12 @@ +{"time":"2025-12-09T13:50:22.900042475Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpao4xz0yf/port-42753.txt","pid":42753,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T13:50:22.900572652Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":42753} +{"time":"2025-12-09T13:50:22.90058245Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-42753-43005-1356035954/socket","Net":"unix"}} +{"time":"2025-12-09T13:50:23.081102199Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T13:50:23.08910113Z","level":"INFO","msg":"handleInformInit: received","streamId":"dl453cfo","id":"1(@)"} +{"time":"2025-12-09T13:50:23.255636669Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"dl453cfo","id":"1(@)"} +{"time":"2025-12-09T13:51:18.281641871Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T13:51:18.281712181Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T13:51:18.281774707Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T13:51:18.281728224Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T13:51:18.28204532Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-42753-43005-1356035954/socket","Net":"unix"}} +{"time":"2025-12-09T13:51:18.516370005Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug-internal.log b/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5694067477fda531a59d251bb2202cd7520df961 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-12-09T13:50:23.089330458Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T13:50:23.255438447Z","level":"INFO","msg":"stream: created new stream","id":"dl453cfo"} +{"time":"2025-12-09T13:50:23.255518205Z","level":"INFO","msg":"handler: started","stream_id":"dl453cfo"} +{"time":"2025-12-09T13:50:23.2556299Z","level":"INFO","msg":"stream: started","id":"dl453cfo"} +{"time":"2025-12-09T13:50:23.255647387Z","level":"INFO","msg":"writer: started","stream_id":"dl453cfo"} +{"time":"2025-12-09T13:50:23.255650645Z","level":"INFO","msg":"sender: started","stream_id":"dl453cfo"} +{"time":"2025-12-09T13:51:18.281710591Z","level":"INFO","msg":"stream: closing","id":"dl453cfo"} diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug.log b/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5838a332244f3e2820ca78a07ca1c64f48c0c009 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 13:50:22,828 INFO MainThread:42753 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 13:50:22,828 INFO MainThread:42753 [wandb_setup.py:_flush():80] Configure stats pid to 42753 +2025-12-09 13:50:22,828 INFO MainThread:42753 [wandb_setup.py:_flush():80] Loading settings from 
/home/ubuntu/.config/wandb/settings +2025-12-09 13:50:22,828 INFO MainThread:42753 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 13:50:22,829 INFO MainThread:42753 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 13:50:22,829 INFO MainThread:42753 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug.log +2025-12-09 13:50:22,829 INFO MainThread:42753 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_135022-dl453cfo/logs/debug-internal.log +2025-12-09 13:50:22,829 INFO MainThread:42753 [wandb_init.py:init():841] calling init triggers +2025-12-09 13:50:22,829 INFO MainThread:42753 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 13:50:22,829 INFO MainThread:42753 [wandb_init.py:init():889] starting backend +2025-12-09 13:50:23,081 INFO MainThread:42753 [wandb_init.py:init():892] sending inform_init request +2025-12-09 13:50:23,087 INFO MainThread:42753 [wandb_init.py:init():900] backend started and connected +2025-12-09 13:50:23,088 INFO MainThread:42753 [wandb_init.py:init():970] updated telemetry +2025-12-09 13:50:23,093 INFO MainThread:42753 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 13:50:23,595 INFO MainThread:42753 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 13:50:23,719 INFO MainThread:42753 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 13:50:23,719 INFO MainThread:42753 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 13:50:23,720 INFO MainThread:42753 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 13:50:23,720 INFO MainThread:42753 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-09 13:50:23,722 INFO MainThread:42753 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 13:50:23,723 INFO MainThread:42753 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': False, 'pretrained_model_name_or_path': '/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': None, 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 13:51:18,281 INFO wandb-AsyncioManager-main:42753 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 13:51:18,281 INFO wandb-AsyncioManager-main:42753 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
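The dl453cfo run above dies in F.cross_entropy with a CUDA out-of-memory error (a single 7.58 GiB allocation on a 40 GiB A100, with this rank already holding roughly 32 GiB). Two mitigations are worth trying before changing the run config: the allocator setting that the error message itself recommends, and computing the token cross-entropy in chunks so the upcast logits are never materialised all at once. The sketch below is illustrative only and is not taken from train_mei_video.py; the tensor names (logits, labels), the ignore_index convention, the chunk size, and the launch command are assumptions.

    # Suggested by the OOM message itself: enable expandable segments before launching, e.g.
    #   PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True accelerate launch train/train_mei_video.py ...

    # Hypothetical chunked cross-entropy: only `chunk` rows of logits are upcast to fp32 at a time,
    # which lowers the peak allocation of the loss step without changing the result.
    import torch
    import torch.nn.functional as F

    def chunked_cross_entropy(logits: torch.Tensor, labels: torch.Tensor, chunk: int = 4096) -> torch.Tensor:
        # logits: [N, vocab] (e.g. bf16), labels: [N] with -100 marking ignored positions
        total = logits.new_zeros((), dtype=torch.float32)
        count = 0
        for start in range(0, logits.shape[0], chunk):
            piece_logits = logits[start:start + chunk].float()
            piece_labels = labels[start:start + chunk]
            valid = int((piece_labels != -100).sum())
            if valid == 0:
                continue
            # sum-reduce per chunk, then divide by the number of valid tokens at the end,
            # matching the default mean over non-ignored positions
            total = total + F.cross_entropy(piece_logits, piece_labels, ignore_index=-100, reduction="sum")
            count += valid
        return total / max(count, 1)

Since train_batch_size is already 1 and gradient checkpointing is enabled in this config, the loss logits (sequence length times the 64000-entry codebook) are plausibly the largest single activation, so chunking the loss, lowering num_frames, or reducing the 256x448 resolution are the natural levers to try next.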
diff --git a/Meissonic/wandb/run-20251209_135022-dl453cfo/run-dl453cfo.wandb b/Meissonic/wandb/run-20251209_135022-dl453cfo/run-dl453cfo.wandb new file mode 100644 index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32 Binary files /dev/null and b/Meissonic/wandb/run-20251209_135022-dl453cfo/run-dl453cfo.wandb differ diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/config.yaml b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..202132cf41af2c7ba127f7a9237a8f5d729945e3 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/config.yaml @@ -0,0 +1,297 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + me1mlu82n7mm3lyja37emytxtqqfliqv: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --pretrained_model_name_or_path + - /mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "8" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12093761908736" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + 
name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T13:56:38.527057Z" + writerId: me1mlu82n7mm3lyja37emytxtqqfliqv + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 8 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +pretrained_model_name_or_path: + value: /mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: null diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/output.log b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e50157ddbcaeb13d14e24dc750d677404541fd9c --- /dev/null +++ b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/output.log @@ -0,0 +1,18 
@@ +12/09/2025 13:56:39 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 13:56:39 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 13:56:39 - INFO - __main__ - Loading minimal video tokenizer config to get mask_token_id and codebook_size... +12/09/2025 13:56:39 - INFO - __main__ - Loading video tokenizer temporarily to get mask_token_id and codebook_size... +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4950.28it/s] +12/09/2025 13:56:40 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1587, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 609, in main + dummy_tokens = video_tokenizer.encode(dummy_video) # [1, F', H', W'] +AttributeError: 'MinimalTokenizer' object has no attribute 'encode' +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1587, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 609, in main +[rank0]: dummy_tokens = video_tokenizer.encode(dummy_video) # [1, F', H', W'] +[rank0]: AttributeError: 'MinimalTokenizer' object has no attribute 'encode' diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/requirements.txt b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 
+importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6452039bc89e441abff9075489ceb1b93dec85c5 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/wandb-metadata.json @@ -0,0 +1,152 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T13:56:38.527057Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--pretrained_model_name_or_path", + "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "8", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": 
"/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12093761908736" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "me1mlu82n7mm3lyja37emytxtqqfliqv" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/wandb-summary.json b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a620d0c1047a4dd8a400939b6da246ed8063a7 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135638-lmzlczvn/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0},"_runtime":0} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug-core.log b/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..7debe050b38fd74f80ee40630043fc418f23b372 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T13:56:38.595965875Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp6xfqqjho/port-48464.txt","pid":48464,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T13:56:38.596570264Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":48464} +{"time":"2025-12-09T13:56:38.596573389Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-48464-48765-3576475713/socket","Net":"unix"}} +{"time":"2025-12-09T13:56:38.782286222Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T13:56:38.788282902Z","level":"INFO","msg":"handleInformInit: received","streamId":"lmzlczvn","id":"1(@)"} 
+{"time":"2025-12-09T13:56:39.557589177Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"lmzlczvn","id":"1(@)"} +{"time":"2025-12-09T13:56:40.672527157Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T13:56:40.67257452Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T13:56:40.672567954Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T13:56:40.672615099Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T13:56:40.672642951Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-48464-48765-3576475713/socket","Net":"unix"}} +{"time":"2025-12-09T13:56:41.19927464Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T13:56:41.199306855Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T13:56:41.199316754Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug-internal.log b/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..808ee64021b9ea38bfb842421bcb8f5ddd1fb637 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T13:56:38.788417234Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T13:56:39.557024041Z","level":"INFO","msg":"stream: created new stream","id":"lmzlczvn"} +{"time":"2025-12-09T13:56:39.557152083Z","level":"INFO","msg":"handler: started","stream_id":"lmzlczvn"} +{"time":"2025-12-09T13:56:39.557575502Z","level":"INFO","msg":"stream: started","id":"lmzlczvn"} +{"time":"2025-12-09T13:56:39.557588441Z","level":"INFO","msg":"writer: started","stream_id":"lmzlczvn"} +{"time":"2025-12-09T13:56:39.557588566Z","level":"INFO","msg":"sender: started","stream_id":"lmzlczvn"} +{"time":"2025-12-09T13:56:40.672577239Z","level":"INFO","msg":"stream: closing","id":"lmzlczvn"} +{"time":"2025-12-09T13:56:41.080716886Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T13:56:41.195394384Z","level":"INFO","msg":"handler: closed","stream_id":"lmzlczvn"} +{"time":"2025-12-09T13:56:41.196096167Z","level":"INFO","msg":"sender: closed","stream_id":"lmzlczvn"} +{"time":"2025-12-09T13:56:41.196108619Z","level":"INFO","msg":"stream: closed","id":"lmzlczvn"} diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug.log b/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5cbd0df5230c04811c709dbf6950288b833d8201 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 13:56:38,529 INFO MainThread:48464 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 13:56:38,529 INFO MainThread:48464 [wandb_setup.py:_flush():80] Configure stats pid to 48464 +2025-12-09 13:56:38,529 INFO MainThread:48464 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 13:56:38,529 INFO MainThread:48464 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 13:56:38,530 INFO MainThread:48464 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 13:56:38,530 
INFO MainThread:48464 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug.log +2025-12-09 13:56:38,530 INFO MainThread:48464 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_135638-lmzlczvn/logs/debug-internal.log +2025-12-09 13:56:38,530 INFO MainThread:48464 [wandb_init.py:init():841] calling init triggers +2025-12-09 13:56:38,530 INFO MainThread:48464 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 13:56:38,530 INFO MainThread:48464 [wandb_init.py:init():889] starting backend +2025-12-09 13:56:38,782 INFO MainThread:48464 [wandb_init.py:init():892] sending inform_init request +2025-12-09 13:56:38,786 INFO MainThread:48464 [wandb_init.py:init():900] backend started and connected +2025-12-09 13:56:38,787 INFO MainThread:48464 [wandb_init.py:init():970] updated telemetry +2025-12-09 13:56:38,792 INFO MainThread:48464 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 13:56:39,777 INFO MainThread:48464 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 13:56:39,900 INFO MainThread:48464 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 13:56:39,900 INFO MainThread:48464 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 13:56:39,900 INFO MainThread:48464 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 13:56:39,900 INFO MainThread:48464 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 13:56:39,903 INFO MainThread:48464 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 13:56:39,904 INFO MainThread:48464 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': '/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 
'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': None, 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 13:56:40,672 INFO wandb-AsyncioManager-main:48464 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 13:56:40,672 INFO wandb-AsyncioManager-main:48464 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251209_135638-lmzlczvn/run-lmzlczvn.wandb b/Meissonic/wandb/run-20251209_135638-lmzlczvn/run-lmzlczvn.wandb new file mode 100644 index 0000000000000000000000000000000000000000..9e62e7d6e3f702d179c67d6a7e9c08a48fb037d2 Binary files /dev/null and b/Meissonic/wandb/run-20251209_135638-lmzlczvn/run-lmzlczvn.wandb differ diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/files/config.yaml b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..418f911e212549fad9b75d6be2c1cec17ef8250e --- /dev/null +++ b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/config.yaml @@ -0,0 +1,297 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 4c3apb56ykx61de6cbks2kgguigh72w8: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --pretrained_model_name_or_path + - /mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "8" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12093762088960" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T13:59:02.075119Z" + writerId: 4c3apb56ykx61de6cbks2kgguigh72w8 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 8 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +pretrained_model_name_or_path: + value: /mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000 +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + 
value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: null diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/files/output.log b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..85d46cba282205fb8c9c2eb1822df345b1c534f5 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/output.log @@ -0,0 +1,60 @@ +12/09/2025 13:59:03 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 13:59:03 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 13:59:03 - INFO - __main__ - Loading minimal video tokenizer config to get mask_token_id and codebook_size... +12/09/2025 13:59:03 - INFO - __main__ - Loading video tokenizer temporarily to get mask_token_id and codebook_size... +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5598.80it/s] +12/09/2025 13:59:04 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 13:59:04 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/09/2025 13:59:04 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +12/09/2025 13:59:04 - INFO - __main__ - Got text_dim from metadata: 4096 +12/09/2025 13:59:34 - INFO - __main__ - Parameter counts: backbone=2,191,878,208, other=2,112,033, total=2,193,990,241 +12/09/2025 13:59:34 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 13:59:34 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 13:59:34 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 13:59:34 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 13:59:34 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 13:59:34 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 13:59:34 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 13:59:34 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 13:59:34 - INFO - __main__ - Dataloader configuration: +12/09/2025 13:59:34 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 13:59:34 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 13:59:34 - INFO - __main__ - - persistent_workers: True +12/09/2025 13:59:34 - INFO - __main__ - - pin_memory: True +12/09/2025 13:59:34 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 13:59:36 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/09/2025 13:59:36 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/09/2025 13:59:36 - INFO - __main__ - ***** Running training ***** +12/09/2025 13:59:36 - INFO - __main__ - Num training steps = 10000 +12/09/2025 13:59:36 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 13:59:36 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/09/2025 13:59:36 - INFO - __main__ - Gradient Accumulation steps = 8 +12/09/2025 13:59:37 - WARNING - __main__ - cond_dropout_prob > 0.0 is not supported with precomputed features. Skipping cond_dropout. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1675, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1492, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.58 GiB. GPU 0 has a total capacity of 39.49 GiB of which 3.84 GiB is free. Process 50348 has 414.00 MiB memory in use. Process 50344 has 414.00 MiB memory in use. Process 50345 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 32.77 GiB memory in use. Process 50349 has 414.00 MiB memory in use. Process 50347 has 414.00 MiB memory in use. Process 50350 has 414.00 MiB memory in use. Process 50346 has 414.00 MiB memory in use. Of the allocated memory 30.94 GiB is allocated by PyTorch, and 753.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1675, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1492, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.58 GiB. GPU 0 has a total capacity of 39.49 GiB of which 3.84 GiB is free. Process 50348 has 414.00 MiB memory in use. Process 50344 has 414.00 MiB memory in use. Process 50345 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 32.77 GiB memory in use. Process 50349 has 414.00 MiB memory in use. Process 50347 has 414.00 MiB memory in use. Process 50350 has 414.00 MiB memory in use. 
Process 50346 has 414.00 MiB memory in use. Of the allocated memory 30.94 GiB is allocated by PyTorch, and 753.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/files/requirements.txt b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 
+nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..38f593a42902dd4775e0362819f414cf7ca54e5f --- /dev/null +++ b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/wandb-metadata.json @@ -0,0 +1,152 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T13:59:02.075119Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--pretrained_model_name_or_path", + "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "8", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12093762088960" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + 
"architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "4c3apb56ykx61de6cbks2kgguigh72w8" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/files/wandb-summary.json b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..48f7d9a062ccb1f6a06780718d54c131168a079a --- /dev/null +++ b/Meissonic/wandb/run-20251209_135902-iktd65kq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":77},"_runtime":77} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug-core.log b/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..bc12488022a3371df43b6280e3d8cd0bf84f6715 --- /dev/null +++ b/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T13:59:02.143839344Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpzs4b0rhb/port-50343.txt","pid":50343,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T13:59:02.144318261Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":50343} +{"time":"2025-12-09T13:59:02.144330176Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-50343-50653-3284053146/socket","Net":"unix"}} +{"time":"2025-12-09T13:59:02.330489605Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T13:59:02.335981868Z","level":"INFO","msg":"handleInformInit: received","streamId":"iktd65kq","id":"1(@)"} +{"time":"2025-12-09T13:59:03.106386993Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"iktd65kq","id":"1(@)"} +{"time":"2025-12-09T14:00:20.892712387Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T14:00:20.892825862Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T14:00:20.892814995Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T14:00:20.892953913Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T14:00:20.892964195Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-50343-50653-3284053146/socket","Net":"unix"}} +{"time":"2025-12-09T14:00:21.260765955Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T14:00:21.26078412Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T14:00:21.260792765Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug-internal.log 
b/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2cc97844edd328fc54fa01bda864f764035ad9ac --- /dev/null +++ b/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T13:59:02.336109648Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T13:59:03.106167184Z","level":"INFO","msg":"stream: created new stream","id":"iktd65kq"} +{"time":"2025-12-09T13:59:03.106257767Z","level":"INFO","msg":"handler: started","stream_id":"iktd65kq"} +{"time":"2025-12-09T13:59:03.106377335Z","level":"INFO","msg":"stream: started","id":"iktd65kq"} +{"time":"2025-12-09T13:59:03.106393438Z","level":"INFO","msg":"writer: started","stream_id":"iktd65kq"} +{"time":"2025-12-09T13:59:03.106402792Z","level":"INFO","msg":"sender: started","stream_id":"iktd65kq"} +{"time":"2025-12-09T14:00:20.892812789Z","level":"INFO","msg":"stream: closing","id":"iktd65kq"} +{"time":"2025-12-09T14:00:21.165060913Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T14:00:21.257647014Z","level":"INFO","msg":"handler: closed","stream_id":"iktd65kq"} +{"time":"2025-12-09T14:00:21.257734093Z","level":"INFO","msg":"sender: closed","stream_id":"iktd65kq"} +{"time":"2025-12-09T14:00:21.257740775Z","level":"INFO","msg":"stream: closed","id":"iktd65kq"} diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug.log b/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..96a4d9ff8dd6c01db6eae94c966a874c69ddd64a --- /dev/null +++ b/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 13:59:02,077 INFO MainThread:50343 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 13:59:02,077 INFO MainThread:50343 [wandb_setup.py:_flush():80] Configure stats pid to 50343 +2025-12-09 13:59:02,077 INFO MainThread:50343 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 13:59:02,077 INFO MainThread:50343 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 13:59:02,077 INFO MainThread:50343 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 13:59:02,077 INFO MainThread:50343 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug.log +2025-12-09 13:59:02,077 INFO MainThread:50343 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_135902-iktd65kq/logs/debug-internal.log +2025-12-09 13:59:02,077 INFO MainThread:50343 [wandb_init.py:init():841] calling init triggers +2025-12-09 13:59:02,078 INFO MainThread:50343 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 13:59:02,078 INFO MainThread:50343 [wandb_init.py:init():889] starting backend +2025-12-09 13:59:02,330 INFO MainThread:50343 [wandb_init.py:init():892] sending inform_init request +2025-12-09 13:59:02,334 INFO MainThread:50343 [wandb_init.py:init():900] backend started and connected +2025-12-09 13:59:02,335 INFO MainThread:50343 [wandb_init.py:init():970] updated telemetry +2025-12-09 13:59:02,339 INFO MainThread:50343 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 13:59:03,425 INFO MainThread:50343 
[wandb_init.py:init():1041] starting run threads in backend +2025-12-09 13:59:03,550 INFO MainThread:50343 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 13:59:03,551 INFO MainThread:50343 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 13:59:03,551 INFO MainThread:50343 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 13:59:03,551 INFO MainThread:50343 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 13:59:03,554 INFO MainThread:50343 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 13:59:03,555 INFO MainThread:50343 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': '/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': None, 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 14:00:20,892 INFO wandb-AsyncioManager-main:50343 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 14:00:20,893 INFO wandb-AsyncioManager-main:50343 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
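Note on the failure recorded above: run iktd65kq dies inside accelerator.backward() with a CUDA out-of-memory error on the 40 GB A100s, and the traceback itself suggests trying PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce fragmentation. The sketch below shows one way that hint could be applied from the launcher side before the next attempt; it is illustrative only (the helper function and its names are not part of train_mei_video.py), under the assumption that the environment variable must be set before torch initializes CUDA.

# Sketch (assumption-laden, not repo code): apply the allocator hint quoted in
# the OOM traceback, then print a short per-device memory report so the next
# run can be sized against the 39.49 GiB capacity reported above.
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch  # imported after the env var so the allocator config takes effect


def report_cuda_memory() -> None:
    """Print allocated/reserved memory for every visible GPU."""
    if not torch.cuda.is_available():
        return
    for idx in range(torch.cuda.device_count()):
        allocated_gib = torch.cuda.memory_allocated(idx) / 2**30
        reserved_gib = torch.cuda.memory_reserved(idx) / 2**30
        print(f"cuda:{idx} allocated={allocated_gib:.2f} GiB reserved={reserved_gib:.2f} GiB")


if __name__ == "__main__":
    report_cuda_memory()

If the allocator hint alone is not enough, the remaining levers in this configuration would likely be the video size (num_frames=4 at 256x448) rather than batching, since train_batch_size is already 1 and gradient_checkpointing is already enabled.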
diff --git a/Meissonic/wandb/run-20251209_135902-iktd65kq/run-iktd65kq.wandb b/Meissonic/wandb/run-20251209_135902-iktd65kq/run-iktd65kq.wandb new file mode 100644 index 0000000000000000000000000000000000000000..d639d1bdce2204e33d6b4debf3b098896874335a Binary files /dev/null and b/Meissonic/wandb/run-20251209_135902-iktd65kq/run-iktd65kq.wandb differ diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/files/config.yaml b/Meissonic/wandb/run-20251209_140322-otx47txf/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0af9e5372997e0b0c7e6b4de22e023385b476f0 --- /dev/null +++ b/Meissonic/wandb/run-20251209_140322-otx47txf/files/config.yaml @@ -0,0 +1,245 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + p6u11ug9znfnhceiv1d6rhmfrzlq1mla: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12093762334720" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T14:03:22.803470Z" + writerId: p6u11ug9znfnhceiv1d6rhmfrzlq1mla + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: false +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0 
+dataloader_num_workers: + value: 4 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: null +instance_data_image: + value: null +instance_dataset: + value: null +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: null +min_masking_rate: + value: 0 +mixed_precision: + value: null +num_frames: + value: 16 +output_dir: + value: muse_training +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: null +split_vae_encode: + value: null +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 16 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: false +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: null +validation_steps: + value: 100 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_backbone_lr_ratio: + value: 0.1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/files/output.log b/Meissonic/wandb/run-20251209_140322-otx47txf/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8b33dfa6a4eaca808318f8960994a28e957e3458 --- /dev/null +++ b/Meissonic/wandb/run-20251209_140322-otx47txf/files/output.log @@ -0,0 +1,40 @@ +12/09/2025 14:03:24 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 14:03:24 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 14:03:24 - INFO - __main__ - Loading minimal video tokenizer config to get mask_token_id and codebook_size... +12/09/2025 14:03:24 - INFO - __main__ - Loading video tokenizer temporarily to get mask_token_id and codebook_size... +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9731.56it/s] +12/09/2025 14:03:25 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 14:03:25 - INFO - __main__ - Getting compressed dimensions from precomputed features... 
+12/09/2025 14:03:25 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +12/09/2025 14:03:25 - INFO - __main__ - Got text_dim from metadata: 4096 +12/09/2025 14:03:25 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:03:25 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 14:03:40 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:03:40 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:03:43 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 14:03:43 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 14:03:43 - INFO - __main__ - Wan backbone lr = 0.000030 (base_lr * 0.1) +12/09/2025 14:03:43 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 14:03:43 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 14:03:43 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 14:03:43 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 14:03:43 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 14:03:43 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 14:03:43 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 14:03:43 - INFO - __main__ - Dataloader configuration: +12/09/2025 14:03:43 - INFO - __main__ - - num_workers: 4 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 14:03:43 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 14:03:43 - INFO - __main__ - - persistent_workers: True +12/09/2025 14:03:43 - INFO - __main__ - - pin_memory: True +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1675, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1255, in main + num_training_steps=args.max_train_steps * accelerator.num_processes, +TypeError: unsupported operand type(s) for *: 'NoneType' and 'int' +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1675, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1255, in main +[rank0]: num_training_steps=args.max_train_steps * accelerator.num_processes, +[rank0]: TypeError: unsupported operand type(s) for *: 'NoneType' and 'int' diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/files/requirements.txt b/Meissonic/wandb/run-20251209_140322-otx47txf/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_140322-otx47txf/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 
+setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_140322-otx47txf/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5b79eb8ecbf1f91854938d44f842652ced559903 --- /dev/null +++ b/Meissonic/wandb/run-20251209_140322-otx47txf/files/wandb-metadata.json @@ -0,0 +1,102 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T14:03:22.803470Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" 
+ }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12093762334720" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "p6u11ug9znfnhceiv1d6rhmfrzlq1mla" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/files/wandb-summary.json b/Meissonic/wandb/run-20251209_140322-otx47txf/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ba532d630a2f6e9086432ef98a4e1304e8ad3f55 --- /dev/null +++ b/Meissonic/wandb/run-20251209_140322-otx47txf/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":19,"_wandb":{"runtime":19}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug-core.log b/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..526c4ea90ec4e80077185b540c003af4257001db --- /dev/null +++ b/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T14:03:22.872698559Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpylheuc_2/port-55260.txt","pid":55260,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T14:03:22.873176264Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":55260} +{"time":"2025-12-09T14:03:22.873179108Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-55260-55565-1676773212/socket","Net":"unix"}} +{"time":"2025-12-09T14:03:23.059133915Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T14:03:23.065070162Z","level":"INFO","msg":"handleInformInit: 
received","streamId":"otx47txf","id":"1(@)"} +{"time":"2025-12-09T14:03:23.735180283Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"otx47txf","id":"1(@)"} +{"time":"2025-12-09T14:03:43.812586417Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T14:03:43.812651667Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T14:03:43.81269649Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T14:03:43.812663556Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T14:03:43.812794162Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-55260-55565-1676773212/socket","Net":"unix"}} +{"time":"2025-12-09T14:03:44.315715108Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T14:03:44.315733252Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T14:03:44.315742409Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug-internal.log b/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3addcc7bfa6cdfe8b1e0e416ef71641a7de1a787 --- /dev/null +++ b/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T14:03:23.065199508Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T14:03:23.734948808Z","level":"INFO","msg":"stream: created new stream","id":"otx47txf"} +{"time":"2025-12-09T14:03:23.735040885Z","level":"INFO","msg":"handler: started","stream_id":"otx47txf"} +{"time":"2025-12-09T14:03:23.735172493Z","level":"INFO","msg":"stream: started","id":"otx47txf"} +{"time":"2025-12-09T14:03:23.735189016Z","level":"INFO","msg":"writer: started","stream_id":"otx47txf"} +{"time":"2025-12-09T14:03:23.735215661Z","level":"INFO","msg":"sender: started","stream_id":"otx47txf"} +{"time":"2025-12-09T14:03:43.812647879Z","level":"INFO","msg":"stream: closing","id":"otx47txf"} +{"time":"2025-12-09T14:03:44.162140778Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T14:03:44.312677483Z","level":"INFO","msg":"handler: closed","stream_id":"otx47txf"} +{"time":"2025-12-09T14:03:44.312772008Z","level":"INFO","msg":"sender: closed","stream_id":"otx47txf"} +{"time":"2025-12-09T14:03:44.312779439Z","level":"INFO","msg":"stream: closed","id":"otx47txf"} diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug.log b/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8832f2b526253fccac140c2bf52a432b3fb5845b --- /dev/null +++ b/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_setup.py:_flush():80] Configure stats pid to 55260 +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_setup.py:_flush():80] Loading settings from 
environment variables +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug.log +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_140322-otx47txf/logs/debug-internal.log +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_init.py:init():841] calling init triggers +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 14:03:22,806 INFO MainThread:55260 [wandb_init.py:init():889] starting backend +2025-12-09 14:03:23,059 INFO MainThread:55260 [wandb_init.py:init():892] sending inform_init request +2025-12-09 14:03:23,063 INFO MainThread:55260 [wandb_init.py:init():900] backend started and connected +2025-12-09 14:03:23,064 INFO MainThread:55260 [wandb_init.py:init():970] updated telemetry +2025-12-09 14:03:23,069 INFO MainThread:55260 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 14:03:24,156 INFO MainThread:55260 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 14:03:24,279 INFO MainThread:55260 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 14:03:24,280 INFO MainThread:55260 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 14:03:24,280 INFO MainThread:55260 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 14:03:24,280 INFO MainThread:55260 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 14:03:24,282 INFO MainThread:55260 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 14:03:24,283 INFO MainThread:55260 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': None, 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': None, 'instance_data_image': None, 'use_8bit_adam': False, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': 2, 'allow_tf32': False, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': 'muse_training', 'seed': None, 'logging_dir': 'logs', 'max_train_steps': None, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 16, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 100, 'mixed_precision': None, 'report_to': 'wandb', 'validation_prompts': None, 'resolution': 512, 'split_vae_encode': None, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': False, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': 
False, 'wan_backbone_lr_ratio': 0.1, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 14:03:43,812 INFO wandb-AsyncioManager-main:55260 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 14:03:43,812 INFO wandb-AsyncioManager-main:55260 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251209_140322-otx47txf/run-otx47txf.wandb b/Meissonic/wandb/run-20251209_140322-otx47txf/run-otx47txf.wandb new file mode 100644 index 0000000000000000000000000000000000000000..612540832748e52d1bd1fe3b6b0c9fd7c5410813 Binary files /dev/null and b/Meissonic/wandb/run-20251209_140322-otx47txf/run-otx47txf.wandb differ diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/config.yaml b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0576510c9410a5424ae3c5268c540d2d0943ace3 --- /dev/null +++ b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/config.yaml @@ -0,0 +1,245 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + cau9z7jh4phx1m0wq4r3ub8xd0sumfyz: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12093762543616" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T14:08:07.626312Z" + writerId: cau9z7jh4phx1m0wq4r3ub8xd0sumfyz + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + 
- 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: false +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0 +dataloader_num_workers: + value: 4 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: null +instance_data_image: + value: null +instance_dataset: + value: null +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: null +min_masking_rate: + value: 0 +mixed_precision: + value: null +num_frames: + value: 16 +output_dir: + value: muse_training +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: null +split_vae_encode: + value: null +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 16 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: false +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: null +validation_steps: + value: 100 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_backbone_lr_ratio: + value: 0.1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/output.log b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..185a699b5650a82a33df26db911f0279ca01eaae --- /dev/null +++ b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/output.log @@ -0,0 +1,278 @@ +12/09/2025 14:08:08 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 14:08:08 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 14:08:08 - INFO - __main__ - Loading minimal video tokenizer config to get mask_token_id and codebook_size... +12/09/2025 14:08:08 - INFO - __main__ - Loading video tokenizer temporarily to get mask_token_id and codebook_size... 
+Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9721.90it/s] +12/09/2025 14:08:09 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 14:08:09 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/09/2025 14:08:09 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +12/09/2025 14:08:09 - INFO - __main__ - Got text_dim from metadata: 4096 +12/09/2025 14:08:09 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:08:09 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 14:08:25 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:08:25 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:08:27 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 14:08:27 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 14:08:27 - INFO - __main__ - Wan backbone lr = 0.000030 (base_lr * 0.1) +12/09/2025 14:08:27 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 14:08:27 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 14:08:27 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 14:08:27 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 14:08:27 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 14:08:27 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 14:08:27 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 14:08:27 - INFO - __main__ - Dataloader configuration: +12/09/2025 14:08:27 - INFO - __main__ - - num_workers: 4 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 14:08:27 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 14:08:27 - INFO - __main__ - - persistent_workers: True +12/09/2025 14:08:27 - INFO - __main__ - - pin_memory: True +12/09/2025 14:08:27 - WARNING - __main__ - max_train_steps not specified, defaulting to 1 epoch (8 steps) +12/09/2025 14:08:27 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 14:08:30 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/09/2025 14:08:30 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/09/2025 14:08:30 - INFO - __main__ - ***** Running training ***** +12/09/2025 14:08:30 - INFO - __main__ - Num training steps = 8 +12/09/2025 14:08:30 - INFO - __main__ - Instantaneous batch size per device = 16 +12/09/2025 14:08:30 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 128 +12/09/2025 14:08:30 - INFO - __main__ - Gradient Accumulation steps = 1 +/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:312: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance. 
+ warnings.warn( +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1685, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1466, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1026, in forward + out_list = self.backbone( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + File "/mnt/Meissonic/src/transformer_video.py", line 471, in torch_dynamo_resume_in_forward_at_471 + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return 
self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped + return self._wrapped_call(self, *args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__ + raise e + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__ + return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File ".140", line 33, in forward + submod_1 = self.compiled_submod_1(l_context_, l_self_modules_cross_attn_modules_k_parameters_weight_, l_self_modules_cross_attn_modules_k_parameters_bias_, l_self_modules_cross_attn_modules_norm_k_parameters_weight_, l_self_modules_cross_attn_modules_v_parameters_weight_, l_self_modules_cross_attn_modules_v_parameters_bias_, getitem, l_self_modules_cross_attn_modules_o_parameters_weight_, l_self_modules_cross_attn_modules_o_parameters_bias_, getitem_1, l_e_3_, l_e_4_, l_e_5_); l_context_ = l_self_modules_cross_attn_modules_k_parameters_weight_ = l_self_modules_cross_attn_modules_k_parameters_bias_ = l_self_modules_cross_attn_modules_norm_k_parameters_weight_ = l_self_modules_cross_attn_modules_v_parameters_weight_ = l_self_modules_cross_attn_modules_v_parameters_bias_ = getitem = l_self_modules_cross_attn_modules_o_parameters_weight_ = l_self_modules_cross_attn_modules_o_parameters_bias_ = getitem_1 = l_e_3_ = l_e_4_ = l_e_5_ = None + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/backends/distributed.py", line 213, in forward + x = self.submod(*args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward + return compiled_fn(full_args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 339, in runtime_wrapper + all_outs = call_func_at_runtime_with_args( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args + out = normalize_as_list(f(args)) + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 103, in g + return f(*args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/function.py", line 581, in apply + return super().apply(*args, **kwargs) # type: ignore[misc] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2118, in forward + fw_outs = call_func_at_runtime_with_args( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args + out = normalize_as_list(f(args)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 629, in wrapper + return compiled_fn(runtime_args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper + return compiled_fn(runtime_args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 724, in inner_fn + outs = compiled_fn(args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__ + return self.current_callable(inputs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/utils.py", line 3017, in run + out = model(new_inputs) + File "/opt/dlami/nvme/tmp_user/torchinductor_ubuntu/i4/ci4fv7peoif3y74vjxhcaw2jns2xsolgu3lq4o3qolhl2zs34f5j.py", line 543, in call + triton_red_fused_add_addmm_mul_native_layer_norm_squeeze_view_5.run(buf16, buf20, primals_10, primals_9, primals_12, primals_11, buf17, buf21, 127200, 1536, stream=stream0) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1272, in run + self.autotune_to_one_config(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1048, in autotune_to_one_config + timings = self.benchmark_all_configs(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1022, in benchmark_all_configs + timings = { + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1023, in + launcher: self.bench(launcher, *args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 891, in bench + return benchmarker.benchmark_gpu(kernel_call, rep=40) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper + return fn(self, *args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.py", line 250, in benchmark_gpu + _callable() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 856, in kernel_call + cloned_args, cloned_kwargs = self.maybe_clone_args( + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 994, in maybe_clone_args + cloned_args = [ + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 995, in + prepare_arg(name, arg) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 990, in prepare_arg + return clone_preserve_strides(arg) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/utils.py", line 3037, in clone_preserve_strides + buffer = torch.as_strided(x, (needed_size,), (1,)).clone() +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 746.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 136.62 MiB is free. Process 58677 has 414.00 MiB memory in use. Process 58676 has 414.00 MiB memory in use. Process 58678 has 414.00 MiB memory in use. Process 58682 has 414.00 MiB memory in use. Process 58679 has 414.00 MiB memory in use. Process 58681 has 414.00 MiB memory in use. Process 58680 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 36.48 GiB memory in use. Of the allocated memory 34.81 GiB is allocated by PyTorch, and 592.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1685, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1466, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in 
_wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1026, in forward +[rank0]: out_list = self.backbone( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward +[rank0]: y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 471, in torch_dynamo_resume_in_forward_at_471 +[rank0]: y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped +[rank0]: return self._wrapped_call(self, *args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__ +[rank0]: raise e +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__ +[rank0]: return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File ".140", line 33, in forward +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/backends/distributed.py", line 213, in forward +[rank0]: x = self.submod(*args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward +[rank0]: return compiled_fn(full_args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 339, in runtime_wrapper +[rank0]: all_outs = call_func_at_runtime_with_args( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args +[rank0]: out = normalize_as_list(f(args)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 103, in g +[rank0]: return f(*args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/function.py", line 581, in apply +[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2118, in forward +[rank0]: fw_outs = call_func_at_runtime_with_args( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args +[rank0]: out = normalize_as_list(f(args)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 629, in wrapper +[rank0]: return compiled_fn(runtime_args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper +[rank0]: return compiled_fn(runtime_args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 724, in inner_fn +[rank0]: outs = compiled_fn(args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__ +[rank0]: return self.current_callable(inputs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/utils.py", line 3017, in run +[rank0]: out = model(new_inputs) +[rank0]: File "/opt/dlami/nvme/tmp_user/torchinductor_ubuntu/i4/ci4fv7peoif3y74vjxhcaw2jns2xsolgu3lq4o3qolhl2zs34f5j.py", line 543, in call +[rank0]: triton_red_fused_add_addmm_mul_native_layer_norm_squeeze_view_5.run(buf16, buf20, primals_10, primals_9, primals_12, primals_11, buf17, buf21, 127200, 1536, stream=stream0) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1272, in run +[rank0]: self.autotune_to_one_config(*args, **kwargs) +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1048, in autotune_to_one_config +[rank0]: timings = self.benchmark_all_configs(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1022, in benchmark_all_configs +[rank0]: timings = { +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1023, in +[rank0]: launcher: self.bench(launcher, *args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 891, in bench +[rank0]: return benchmarker.benchmark_gpu(kernel_call, rep=40) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper +[rank0]: return fn(self, *args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.py", line 250, in benchmark_gpu +[rank0]: _callable() +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 856, in kernel_call +[rank0]: cloned_args, cloned_kwargs = self.maybe_clone_args( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 994, in maybe_clone_args +[rank0]: cloned_args = [ +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 995, in +[rank0]: prepare_arg(name, arg) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 990, in prepare_arg +[rank0]: return clone_preserve_strides(arg) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/utils.py", line 3037, in clone_preserve_strides +[rank0]: buffer = torch.as_strided(x, (needed_size,), (1,)).clone() +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 746.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 136.62 MiB is free. Process 58677 has 414.00 MiB memory in use. Process 58676 has 414.00 MiB memory in use. Process 58678 has 414.00 MiB memory in use. Process 58682 has 414.00 MiB memory in use. Process 58679 has 414.00 MiB memory in use. Process 58681 has 414.00 MiB memory in use. Process 58680 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 36.48 GiB memory in use. Of the allocated memory 34.81 GiB is allocated by PyTorch, and 592.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/requirements.txt b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/wandb-metadata.json 
b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f2c3ea882d8ee13561ef4b36a94f07cdb4ca3e8e --- /dev/null +++ b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/wandb-metadata.json @@ -0,0 +1,102 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T14:08:07.626312Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12093762543616" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "cau9z7jh4phx1m0wq4r3ub8xd0sumfyz" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/wandb-summary.json b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..65efeb016520ebca12e5019029f5a6f6602702bd --- /dev/null +++ b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":73},"_runtime":73} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug-core.log b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug-core.log new file 
mode 100644 index 0000000000000000000000000000000000000000..ca37eb728d9fd18b80ee4aa5b2c62bfc0424a665 --- /dev/null +++ b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T14:08:07.696352758Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpjw7ydas_/port-58675.txt","pid":58675,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T14:08:07.696873887Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":58675} +{"time":"2025-12-09T14:08:07.696843305Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-58675-58965-572711673/socket","Net":"unix"}} +{"time":"2025-12-09T14:08:07.882459638Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T14:08:07.889241598Z","level":"INFO","msg":"handleInformInit: received","streamId":"2xc2ptdw","id":"1(@)"} +{"time":"2025-12-09T14:08:08.583749275Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"2xc2ptdw","id":"1(@)"} +{"time":"2025-12-09T14:09:22.287327877Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T14:09:22.287407746Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T14:09:22.287397988Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T14:09:22.287501059Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T14:09:22.287501873Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-58675-58965-572711673/socket","Net":"unix"}} +{"time":"2025-12-09T14:09:22.651880493Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T14:09:22.651899801Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T14:09:22.651909159Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug-internal.log b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5e3e99b5e69e38a503db48f6ff147cd303e2c190 --- /dev/null +++ b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T14:08:07.889337777Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T14:08:08.583510509Z","level":"INFO","msg":"stream: created new stream","id":"2xc2ptdw"} +{"time":"2025-12-09T14:08:08.583604343Z","level":"INFO","msg":"handler: started","stream_id":"2xc2ptdw"} +{"time":"2025-12-09T14:08:08.583742797Z","level":"INFO","msg":"stream: started","id":"2xc2ptdw"} +{"time":"2025-12-09T14:08:08.583756217Z","level":"INFO","msg":"writer: started","stream_id":"2xc2ptdw"} +{"time":"2025-12-09T14:08:08.583763508Z","level":"INFO","msg":"sender: started","stream_id":"2xc2ptdw"} +{"time":"2025-12-09T14:09:22.287406211Z","level":"INFO","msg":"stream: closing","id":"2xc2ptdw"} +{"time":"2025-12-09T14:09:22.52809835Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T14:09:22.648874259Z","level":"INFO","msg":"handler: closed","stream_id":"2xc2ptdw"} +{"time":"2025-12-09T14:09:22.648958362Z","level":"INFO","msg":"sender: closed","stream_id":"2xc2ptdw"} 
+{"time":"2025-12-09T14:09:22.648969064Z","level":"INFO","msg":"stream: closed","id":"2xc2ptdw"} diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug.log b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e64e60f8226a04d8ea3bc535cbe8eb19a1ec509b --- /dev/null +++ b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug.log @@ -0,0 +1,37 @@ +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_setup.py:_flush():80] Configure stats pid to 58675 +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug.log +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_140807-2xc2ptdw/logs/debug-internal.log +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_init.py:init():841] calling init triggers +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 14:08:07,629 INFO MainThread:58675 [wandb_init.py:init():889] starting backend +2025-12-09 14:08:07,882 INFO MainThread:58675 [wandb_init.py:init():892] sending inform_init request +2025-12-09 14:08:07,887 INFO MainThread:58675 [wandb_init.py:init():900] backend started and connected +2025-12-09 14:08:07,890 INFO MainThread:58675 [wandb_init.py:init():970] updated telemetry +2025-12-09 14:08:07,895 INFO MainThread:58675 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 14:08:08,800 INFO MainThread:58675 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 14:08:08,923 INFO MainThread:58675 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 14:08:08,923 INFO MainThread:58675 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 14:08:08,923 INFO MainThread:58675 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 14:08:08,923 INFO MainThread:58675 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-09 14:08:08,926 INFO MainThread:58675 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 14:08:08,927 INFO MainThread:58675 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': None, 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': None, 'instance_data_image': None, 'use_8bit_adam': False, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': 2, 'allow_tf32': False, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': 'muse_training', 'seed': None, 'logging_dir': 'logs', 'max_train_steps': None, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 16, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 100, 'mixed_precision': None, 'report_to': 'wandb', 'validation_prompts': None, 'resolution': 512, 'split_vae_encode': None, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': False, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.1, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 14:09:22,287 INFO wandb-AsyncioManager-main:58675 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 14:09:22,287 INFO wandb-AsyncioManager-main:58675 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. +2025-12-09 14:09:23,401 ERROR wandb-AsyncioManager-main:58675 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback. 
+Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions + await fn() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish + await self._send_server_request(request) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request + await self._writer.drain() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 371, in drain + await self._protocol._drain_helper() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/asyncio/streams.py", line 167, in _drain_helper + raise ConnectionResetError('Connection lost') +ConnectionResetError: Connection lost diff --git a/Meissonic/wandb/run-20251209_140807-2xc2ptdw/run-2xc2ptdw.wandb b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/run-2xc2ptdw.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a7dbcc1e4fbd4089384dce1cc03b9ec8ecc399a2 Binary files /dev/null and b/Meissonic/wandb/run-20251209_140807-2xc2ptdw/run-2xc2ptdw.wandb differ diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/config.yaml b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb42651aa401f0850266b3e4c0e47227d88c3701 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/config.yaml @@ -0,0 +1,245 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 6yeky5r0m22mgji4j68ozyjfe5hn9j2a: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12093762883584" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 
6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T14:14:05.817929Z" + writerId: 6yeky5r0m22mgji4j68ozyjfe5hn9j2a + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: false +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0 +dataloader_num_workers: + value: 4 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: null +instance_data_image: + value: null +instance_dataset: + value: null +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 50 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 500 +max_grad_norm: + value: 50 +max_train_steps: + value: null +min_masking_rate: + value: 0 +mixed_precision: + value: null +num_frames: + value: 16 +output_dir: + value: muse_training +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: null +split_vae_encode: + value: null +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 16 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: false +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: null +validation_steps: + value: 100 +variant: + value: null +video_height: + value: 480 +video_tokenizer_model_id: + value: Cosmos-1.0-Tokenizer-DV8x16x16 +video_width: + value: 848 +wan_backbone_lr_ratio: + value: 0.1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/output.log b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f4fc892ca73a17eb7c5ca6e2663edb3a0da306ea --- /dev/null +++ b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/output.log @@ -0,0 +1,277 @@ +12/09/2025 14:14:07 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 14:14:07 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 14:14:07 - INFO - __main__ - 
Loading minimal video tokenizer config to get mask_token_id and codebook_size... +12/09/2025 14:14:07 - INFO - __main__ - Loading video tokenizer temporarily to get mask_token_id and codebook_size... +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9855.70it/s] +12/09/2025 14:14:07 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 14:14:07 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/09/2025 14:14:07 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +12/09/2025 14:14:07 - INFO - __main__ - Got text_dim from metadata: 4096 +12/09/2025 14:14:07 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:14:08 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 14:14:24 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:14:24 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:14:26 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 14:14:26 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 14:14:26 - INFO - __main__ - Wan backbone lr = 0.000030 (base_lr * 0.1) +12/09/2025 14:14:26 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 14:14:26 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 14:14:26 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 14:14:26 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 14:14:26 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 14:14:26 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 14:14:26 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 14:14:26 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/09/2025 14:14:26 - INFO - __main__ - Dataloader configuration: +12/09/2025 14:14:26 - INFO - __main__ - - num_workers: 4 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 14:14:26 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 14:14:26 - INFO - __main__ - - persistent_workers: True +12/09/2025 14:14:26 - INFO - __main__ - - pin_memory: True +12/09/2025 14:14:26 - WARNING - __main__ - max_train_steps not specified, defaulting to 1 epoch (8 steps) +12/09/2025 14:14:26 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 14:14:28 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/09/2025 14:14:28 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/09/2025 14:14:28 - INFO - __main__ - ***** Running training ***** +12/09/2025 14:14:28 - INFO - __main__ - Num training steps = 8 +12/09/2025 14:14:28 - INFO - __main__ - Instantaneous batch size per device = 16 +12/09/2025 14:14:28 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 128 +12/09/2025 14:14:28 - INFO - __main__ - Gradient Accumulation steps = 1 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1717, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1498, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1026, in forward + out_list = self.backbone( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + File "/mnt/Meissonic/src/transformer_video.py", line 471, in torch_dynamo_resume_in_forward_at_471 + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped + return self._wrapped_call(self, *args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__ + raise e + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__ + return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File ".138", line 33, in forward + submod_1 = self.compiled_submod_1(l_context_, l_self_modules_cross_attn_modules_k_parameters_weight_, l_self_modules_cross_attn_modules_k_parameters_bias_, l_self_modules_cross_attn_modules_norm_k_parameters_weight_, l_self_modules_cross_attn_modules_v_parameters_weight_, l_self_modules_cross_attn_modules_v_parameters_bias_, getitem, l_self_modules_cross_attn_modules_o_parameters_weight_, l_self_modules_cross_attn_modules_o_parameters_bias_, getitem_1, l_e_3_, l_e_4_, l_e_5_); l_context_ = l_self_modules_cross_attn_modules_k_parameters_weight_ = l_self_modules_cross_attn_modules_k_parameters_bias_ = l_self_modules_cross_attn_modules_norm_k_parameters_weight_ = l_self_modules_cross_attn_modules_v_parameters_weight_ = l_self_modules_cross_attn_modules_v_parameters_bias_ = getitem = l_self_modules_cross_attn_modules_o_parameters_weight_ = l_self_modules_cross_attn_modules_o_parameters_bias_ = getitem_1 = l_e_3_ = l_e_4_ = l_e_5_ = None + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/backends/distributed.py", line 213, in forward + x = self.submod(*args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward + return compiled_fn(full_args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 339, in runtime_wrapper + all_outs = call_func_at_runtime_with_args( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in 
call_func_at_runtime_with_args + out = normalize_as_list(f(args)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 103, in g + return f(*args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/function.py", line 581, in apply + return super().apply(*args, **kwargs) # type: ignore[misc] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2118, in forward + fw_outs = call_func_at_runtime_with_args( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args + out = normalize_as_list(f(args)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 629, in wrapper + return compiled_fn(runtime_args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper + return compiled_fn(runtime_args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 724, in inner_fn + outs = compiled_fn(args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__ + return self.current_callable(inputs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/utils.py", line 3017, in run + out = model(new_inputs) + File "/opt/dlami/nvme/tmp_user/torchinductor_ubuntu/i4/ci4fv7peoif3y74vjxhcaw2jns2xsolgu3lq4o3qolhl2zs34f5j.py", line 543, in call + triton_red_fused_add_addmm_mul_native_layer_norm_squeeze_view_5.run(buf16, buf20, primals_10, primals_9, primals_12, primals_11, buf17, buf21, 127200, 1536, stream=stream0) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1272, in run + self.autotune_to_one_config(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1048, in autotune_to_one_config + timings = self.benchmark_all_configs(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1022, in benchmark_all_configs + timings = { + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1023, in + launcher: self.bench(launcher, *args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 891, in bench + return benchmarker.benchmark_gpu(kernel_call, rep=40) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper + return fn(self, *args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.py", line 250, in benchmark_gpu + _callable() + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 856, in kernel_call + cloned_args, cloned_kwargs = self.maybe_clone_args( + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 994, in maybe_clone_args + cloned_args = [ + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 995, in + prepare_arg(name, arg) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 990, in prepare_arg + return clone_preserve_strides(arg) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/utils.py", line 3037, in clone_preserve_strides + buffer = torch.as_strided(x, (needed_size,), (1,)).clone() +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 746.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 136.62 MiB is free. Process 68014 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 36.48 GiB memory in use. Process 68012 has 414.00 MiB memory in use. Process 68011 has 414.00 MiB memory in use. Process 68013 has 414.00 MiB memory in use. Process 68016 has 414.00 MiB memory in use. Process 68010 has 414.00 MiB memory in use. Process 68015 has 414.00 MiB memory in use. Of the allocated memory 34.81 GiB is allocated by PyTorch, and 592.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1717, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1498, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in 
_wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1026, in forward +[rank0]: out_list = self.backbone( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward +[rank0]: y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 471, in torch_dynamo_resume_in_forward_at_471 +[rank0]: y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 837, in call_wrapped +[rank0]: return self._wrapped_call(self, *args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 413, in __call__ +[rank0]: raise e +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/fx/graph_module.py", line 400, in __call__ +[rank0]: return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File ".138", line 33, in forward +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/backends/distributed.py", line 213, in forward +[rank0]: x = self.submod(*args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1130, in forward +[rank0]: return compiled_fn(full_args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 339, in runtime_wrapper +[rank0]: all_outs = call_func_at_runtime_with_args( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args +[rank0]: out = normalize_as_list(f(args)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 103, in g +[rank0]: return f(*args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/function.py", line 581, in apply +[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2118, in forward +[rank0]: fw_outs = call_func_at_runtime_with_args( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 129, in call_func_at_runtime_with_args +[rank0]: out = normalize_as_list(f(args)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 629, in wrapper +[rank0]: return compiled_fn(runtime_args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 526, in wrapper +[rank0]: return compiled_fn(runtime_args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 724, in inner_fn +[rank0]: outs = compiled_fn(args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/output_code.py", line 613, in __call__ +[rank0]: return self.current_callable(inputs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/utils.py", line 3017, in run +[rank0]: out = model(new_inputs) +[rank0]: File "/opt/dlami/nvme/tmp_user/torchinductor_ubuntu/i4/ci4fv7peoif3y74vjxhcaw2jns2xsolgu3lq4o3qolhl2zs34f5j.py", line 543, in call +[rank0]: triton_red_fused_add_addmm_mul_native_layer_norm_squeeze_view_5.run(buf16, buf20, primals_10, primals_9, primals_12, primals_11, buf17, buf21, 127200, 1536, stream=stream0) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1272, in run +[rank0]: self.autotune_to_one_config(*args, **kwargs) +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1048, in autotune_to_one_config +[rank0]: timings = self.benchmark_all_configs(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1022, in benchmark_all_configs +[rank0]: timings = { +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 1023, in +[rank0]: launcher: self.bench(launcher, *args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 891, in bench +[rank0]: return benchmarker.benchmark_gpu(kernel_call, rep=40) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper +[rank0]: return fn(self, *args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.py", line 250, in benchmark_gpu +[rank0]: _callable() +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 856, in kernel_call +[rank0]: cloned_args, cloned_kwargs = self.maybe_clone_args( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 994, in maybe_clone_args +[rank0]: cloned_args = [ +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 995, in +[rank0]: prepare_arg(name, arg) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 990, in prepare_arg +[rank0]: return clone_preserve_strides(arg) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_inductor/utils.py", line 3037, in clone_preserve_strides +[rank0]: buffer = torch.as_strided(x, (needed_size,), (1,)).clone() +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 746.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 136.62 MiB is free. Process 68014 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 36.48 GiB memory in use. Process 68012 has 414.00 MiB memory in use. Process 68011 has 414.00 MiB memory in use. Process 68013 has 414.00 MiB memory in use. Process 68016 has 414.00 MiB memory in use. Process 68010 has 414.00 MiB memory in use. Process 68015 has 414.00 MiB memory in use. Of the allocated memory 34.81 GiB is allocated by PyTorch, and 592.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/requirements.txt b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/wandb-metadata.json 
b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b744d390864719718398117fdd944e687d270574 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/wandb-metadata.json @@ -0,0 +1,102 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T14:14:05.817929Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12093762883584" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "6yeky5r0m22mgji4j68ozyjfe5hn9j2a" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/wandb-summary.json b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..a3e80f52165bcc0e5fdf4a7be18b9b5db42e53a8 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141405-x4hogfbn/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":57,"_wandb":{"runtime":57}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug-core.log b/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug-core.log new file 
mode 100644 index 0000000000000000000000000000000000000000..e56c66ac1e53d8372b8b4b42c4d1577dada154b6 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T14:14:05.889317786Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpm9xxyxn_/port-68009.txt","pid":68009,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T14:14:05.88993588Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":68009} +{"time":"2025-12-09T14:14:05.889907813Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-68009-68307-4265498081/socket","Net":"unix"}} +{"time":"2025-12-09T14:14:06.073781124Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T14:14:06.079762596Z","level":"INFO","msg":"handleInformInit: received","streamId":"x4hogfbn","id":"1(@)"} +{"time":"2025-12-09T14:14:06.847672609Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"x4hogfbn","id":"1(@)"} +{"time":"2025-12-09T14:15:04.374591532Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T14:15:04.374771022Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T14:15:04.374793745Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T14:15:04.374873463Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T14:15:04.374929441Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-68009-68307-4265498081/socket","Net":"unix"}} +{"time":"2025-12-09T14:15:04.807072953Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T14:15:04.80709144Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T14:15:04.807100361Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug-internal.log b/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..58ec434b3bcd95996b8615c1f4ec0e30b9048346 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T14:14:06.079873033Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T14:14:06.847376958Z","level":"INFO","msg":"stream: created new stream","id":"x4hogfbn"} +{"time":"2025-12-09T14:14:06.847456859Z","level":"INFO","msg":"handler: started","stream_id":"x4hogfbn"} +{"time":"2025-12-09T14:14:06.847572206Z","level":"INFO","msg":"sender: started","stream_id":"x4hogfbn"} +{"time":"2025-12-09T14:14:06.847560437Z","level":"INFO","msg":"writer: started","stream_id":"x4hogfbn"} +{"time":"2025-12-09T14:14:06.847536307Z","level":"INFO","msg":"stream: started","id":"x4hogfbn"} +{"time":"2025-12-09T14:15:04.374776579Z","level":"INFO","msg":"stream: closing","id":"x4hogfbn"} +{"time":"2025-12-09T14:15:04.650892563Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T14:15:04.803844803Z","level":"INFO","msg":"handler: closed","stream_id":"x4hogfbn"} +{"time":"2025-12-09T14:15:04.803956881Z","level":"INFO","msg":"sender: closed","stream_id":"x4hogfbn"} 
+{"time":"2025-12-09T14:15:04.803963926Z","level":"INFO","msg":"stream: closed","id":"x4hogfbn"} diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug.log b/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..f0bb32dc1987fbb207fd3380466826c47de1cb0c --- /dev/null +++ b/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_setup.py:_flush():80] Configure stats pid to 68009 +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug.log +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_141405-x4hogfbn/logs/debug-internal.log +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_init.py:init():841] calling init triggers +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 14:14:05,820 INFO MainThread:68009 [wandb_init.py:init():889] starting backend +2025-12-09 14:14:06,074 INFO MainThread:68009 [wandb_init.py:init():892] sending inform_init request +2025-12-09 14:14:06,078 INFO MainThread:68009 [wandb_init.py:init():900] backend started and connected +2025-12-09 14:14:06,079 INFO MainThread:68009 [wandb_init.py:init():970] updated telemetry +2025-12-09 14:14:06,083 INFO MainThread:68009 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 14:14:07,064 INFO MainThread:68009 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 14:14:07,191 INFO MainThread:68009 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 14:14:07,191 INFO MainThread:68009 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 14:14:07,191 INFO MainThread:68009 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 14:14:07,191 INFO MainThread:68009 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-09 14:14:07,194 INFO MainThread:68009 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 14:14:07,195 INFO MainThread:68009 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': None, 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': None, 'instance_data_image': None, 'use_8bit_adam': False, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': 2, 'allow_tf32': False, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': 'muse_training', 'seed': None, 'logging_dir': 'logs', 'max_train_steps': None, 'checkpointing_steps': 500, 'logging_steps': 50, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 16, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 500, 'validation_steps': 100, 'mixed_precision': None, 'report_to': 'wandb', 'validation_prompts': None, 'resolution': 512, 'split_vae_encode': None, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': False, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 480, 'video_width': 848, 'video_tokenizer_model_id': 'Cosmos-1.0-Tokenizer-DV8x16x16', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.1, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 14:15:04,374 INFO wandb-AsyncioManager-main:68009 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 14:15:04,375 INFO wandb-AsyncioManager-main:68009 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
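The run above (x4hogfbn) aborts with a CUDA OutOfMemoryError while Inductor autotunes a fused kernel; the error text itself suggests trying PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True when a large amount of memory is reserved but unallocated. Below is a minimal sketch of how that setting could be applied, assuming the variable is placed in the environment before torch initializes its CUDA allocator (e.g. at the very top of the training entry point or in the launcher environment). Note the log reports only ~592 MiB reserved-but-unallocated against ~34.8 GiB actually allocated by PyTorch, so this tweak alone is unlikely to rescue the 16-per-device batch; lowering --train_batch_size is the more direct remedy.

```python
# Sketch (assumption): set the allocator option before the first CUDA allocation,
# e.g. before importing torch in train/train_mei_video.py or via the shell environment.
import os

# Ask the caching allocator to use expandable segments, as suggested by the
# OutOfMemoryError message, to reduce fragmentation of reserved-but-unallocated memory.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch  # noqa: E402  -- imported after the env var so the allocator sees it

# Sanity check only; the actual training script continues from here.
print(torch.cuda.is_available(), os.environ["PYTORCH_CUDA_ALLOC_CONF"])
```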
diff --git a/Meissonic/wandb/run-20251209_141405-x4hogfbn/run-x4hogfbn.wandb b/Meissonic/wandb/run-20251209_141405-x4hogfbn/run-x4hogfbn.wandb new file mode 100644 index 0000000000000000000000000000000000000000..1cc51377dcf97bf04b6719c301b1468d44156774 Binary files /dev/null and b/Meissonic/wandb/run-20251209_141405-x4hogfbn/run-x4hogfbn.wandb differ diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/config.yaml b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..895cfa4ed779ada88f03bac12ccc4bdbc12dd427 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/config.yaml @@ -0,0 +1,301 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + kb0bzb40td6ovt07v98pjfl246wmk2wf: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12093763047424" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + 
memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T14:16:18.415636Z" + writerId: kb0bzb40td6ovt07v98pjfl246wmk2wf + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/output.log b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5447bc5f8b12927544d46288ec6fc82a8bec1b70 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/output.log @@ -0,0 +1,79 @@ +12/09/2025 
14:16:19 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 14:16:19 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 14:16:19 - INFO - __main__ - Loading minimal video tokenizer config to get mask_token_id and codebook_size... +12/09/2025 14:16:19 - INFO - __main__ - Loading video tokenizer temporarily to get mask_token_id and codebook_size... +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9362.29it/s] +12/09/2025 14:16:20 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 14:16:20 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/09/2025 14:16:20 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +12/09/2025 14:16:20 - INFO - __main__ - Got text_dim from metadata: 4096 +12/09/2025 14:16:20 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:16:20 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 14:16:37 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:16:37 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:16:39 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 14:16:41 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 14:16:41 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 14:16:41 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 14:16:41 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 14:16:41 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 14:16:41 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 14:16:41 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 14:16:41 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 14:16:41 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 14:16:41 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/09/2025 14:16:41 - INFO - __main__ - Dataloader configuration: +12/09/2025 14:16:41 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 14:16:41 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 14:16:41 - INFO - __main__ - - persistent_workers: True +12/09/2025 14:16:41 - INFO - __main__ - - pin_memory: True +12/09/2025 14:16:41 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 14:16:43 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/09/2025 14:16:43 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/09/2025 14:16:43 - INFO - __main__ - ***** Running training ***** +12/09/2025 14:16:43 - INFO - __main__ - Num training steps = 10000 +12/09/2025 14:16:43 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 14:16:43 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/09/2025 14:16:43 - INFO - __main__ - Gradient Accumulation steps = 1 +12/09/2025 14:16:44 - WARNING - __main__ - cond_dropout_prob > 0.0 is not supported with precomputed features. Skipping cond_dropout. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1717, in <module> + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1534, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.58 GiB. GPU 0 has a total capacity of 39.49 GiB of which 7.23 GiB is free. Process 70750 has 414.00 MiB memory in use. Process 70753 has 414.00 MiB memory in use. Process 70749 has 414.00 MiB memory in use. Process 70754 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 29.39 GiB memory in use. Process 70752 has 414.00 MiB memory in use. Process 70748 has 414.00 MiB memory in use. Process 70751 has 414.00 MiB memory in use. Of the allocated memory 28.07 GiB is allocated by PyTorch, and 234.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1717, in <module> +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1534, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.58 GiB. GPU 0 has a total capacity of 39.49 GiB of which 7.23 GiB is free. Process 70750 has 414.00 MiB memory in use. Process 70753 has 414.00 MiB memory in use. Process 70749 has 414.00 MiB memory in use. Process 70754 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 29.39 GiB memory in use. Process 70752 has 414.00 MiB memory in use. Process 70748 has 414.00 MiB memory in use.
Process 70751 has 414.00 MiB memory in use. Of the allocated memory 28.07 GiB is allocated by PyTorch, and 234.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Exception ignored in atexit callback: +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1648, in _clean_up_worker + w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/process.py", line 149, in join + res = self._popen.wait(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait + if not wait([self.sentinel], timeout): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt: diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/requirements.txt b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 
+cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..00478088d19afd60c3b885136e8859d8ff427a49 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T14:16:18.415636Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": 
"12093763047424" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "kb0bzb40td6ovt07v98pjfl246wmk2wf" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/wandb-summary.json b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..8e8b56dc1e46b00cf7f7377c59e01f228d41fe3f --- /dev/null +++ b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":62,"_wandb":{"runtime":62}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug-core.log b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..74662a552ddef4b9b28334a85abbc20fc8b10d58 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T14:16:18.48416748Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp1m1_z9x0/port-70747.txt","pid":70747,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T14:16:18.48459797Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":70747} +{"time":"2025-12-09T14:16:18.484606503Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-70747-71068-298534146/socket","Net":"unix"}} +{"time":"2025-12-09T14:16:18.671246579Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T14:16:18.677309787Z","level":"INFO","msg":"handleInformInit: received","streamId":"v5rp4n5k","id":"1(@)"} +{"time":"2025-12-09T14:16:19.246997822Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"v5rp4n5k","id":"1(@)"} +{"time":"2025-12-09T14:17:21.807072348Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} 
+{"time":"2025-12-09T14:17:21.807124574Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T14:17:21.807121112Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T14:17:21.80732553Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-70747-71068-298534146/socket","Net":"unix"}} +{"time":"2025-12-09T14:17:21.807357877Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T14:17:22.230073379Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T14:17:22.230098436Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T14:17:22.23010832Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug-internal.log b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3f9845251800ac24a906597a7a46535a1ad1624e --- /dev/null +++ b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T14:16:18.677456814Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T14:16:19.246695362Z","level":"INFO","msg":"stream: created new stream","id":"v5rp4n5k"} +{"time":"2025-12-09T14:16:19.246790913Z","level":"INFO","msg":"handler: started","stream_id":"v5rp4n5k"} +{"time":"2025-12-09T14:16:19.246950858Z","level":"INFO","msg":"writer: started","stream_id":"v5rp4n5k"} +{"time":"2025-12-09T14:16:19.246919928Z","level":"INFO","msg":"stream: started","id":"v5rp4n5k"} +{"time":"2025-12-09T14:16:19.246954337Z","level":"INFO","msg":"sender: started","stream_id":"v5rp4n5k"} +{"time":"2025-12-09T14:17:21.807132231Z","level":"INFO","msg":"stream: closing","id":"v5rp4n5k"} +{"time":"2025-12-09T14:17:22.084826724Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T14:17:22.226568945Z","level":"INFO","msg":"handler: closed","stream_id":"v5rp4n5k"} +{"time":"2025-12-09T14:17:22.226683646Z","level":"INFO","msg":"sender: closed","stream_id":"v5rp4n5k"} +{"time":"2025-12-09T14:17:22.226691201Z","level":"INFO","msg":"stream: closed","id":"v5rp4n5k"} diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug.log b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ec28e257839b1b934107d99c46c88719436fffd6 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_setup.py:_flush():80] Configure stats pid to 70747 +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug.log +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_init.py:setup_run_log_directory():715] Logging 
internal logs to /mnt/Meissonic/wandb/run-20251209_141618-v5rp4n5k/logs/debug-internal.log +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_init.py:init():841] calling init triggers +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 14:16:18,418 INFO MainThread:70747 [wandb_init.py:init():889] starting backend +2025-12-09 14:16:18,671 INFO MainThread:70747 [wandb_init.py:init():892] sending inform_init request +2025-12-09 14:16:18,675 INFO MainThread:70747 [wandb_init.py:init():900] backend started and connected +2025-12-09 14:16:18,677 INFO MainThread:70747 [wandb_init.py:init():970] updated telemetry +2025-12-09 14:16:18,681 INFO MainThread:70747 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 14:16:19,460 INFO MainThread:70747 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 14:16:19,585 INFO MainThread:70747 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 14:16:19,585 INFO MainThread:70747 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 14:16:19,585 INFO MainThread:70747 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 14:16:19,585 INFO MainThread:70747 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 14:16:19,589 INFO MainThread:70747 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 14:16:19,589 INFO MainThread:70747 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 
14:17:21,807 INFO wandb-AsyncioManager-main:70747 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 14:17:21,807 INFO wandb-AsyncioManager-main:70747 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251209_141618-v5rp4n5k/run-v5rp4n5k.wandb b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/run-v5rp4n5k.wandb new file mode 100644 index 0000000000000000000000000000000000000000..4cb0daf65fbfbfe5223fb1f4a2a95d8c12f2825a Binary files /dev/null and b/Meissonic/wandb/run-20251209_141618-v5rp4n5k/run-v5rp4n5k.wandb differ diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_7b2c7dbea7c77c3a3523.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_7b2c7dbea7c77c3a3523.png new file mode 100644 index 0000000000000000000000000000000000000000..fe9d0c71e1bb9e1427dd51b01336e7555f94cfd1 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_7b2c7dbea7c77c3a3523.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2c7dbea7c77c3a35238f0fb5e799c84a960eb7fb86424199551945ee4bd12f +size 142225 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_d3b01b8e129b539a85ed.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_d3b01b8e129b539a85ed.png new file mode 100644 index 0000000000000000000000000000000000000000..4a80cb85e634b71c019cb30a98026cef72b6de7a --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_d3b01b8e129b539a85ed.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b01b8e129b539a85ed3a366f5dda9d8a2ddeeaf4c86fdf85666b1fed702412 +size 151513 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_287117d5d7643ba31ec4.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_287117d5d7643ba31ec4.png new file mode 100644 index 0000000000000000000000000000000000000000..446ff5a0efc460653a3ba2816a1aba0f75f004a8 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_287117d5d7643ba31ec4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287117d5d7643ba31ec4edb4220888e3e3039632e354ab390fc18b48722eeabe +size 144805 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_f6b18ba278e34d44baab.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_f6b18ba278e34d44baab.png new file mode 100644 index 0000000000000000000000000000000000000000..d7280ec4c28cbe24e7c9af972e31ea0de2755f94 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_f6b18ba278e34d44baab.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b18ba278e34d44baabc384a855c5179e8a1bada0a2a0278bdeb1ca432f1ac5 +size 153161 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_321720abba124381620b.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_321720abba124381620b.png new file mode 100644 index 
0000000000000000000000000000000000000000..062e91e05b7b5d2f115315e2a3cc6fd2273eebcb --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_321720abba124381620b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:321720abba124381620b47e9787cba8eb803ad1c4072172fd3800750374d7536 +size 142016 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_fa7af054654656754134.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_fa7af054654656754134.png new file mode 100644 index 0000000000000000000000000000000000000000..9588194ccc88b3fb9f50423b2b9102d25120cf06 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_fa7af054654656754134.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa7af05465465675413476e517f43ff87b883cbf3a4bf2608c4081c4d58d8929 +size 153715 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_e6c1efef5a74bd11c582.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_e6c1efef5a74bd11c582.png new file mode 100644 index 0000000000000000000000000000000000000000..3460543dcd7c86f9810c764bee6527374572eaa6 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_e6c1efef5a74bd11c582.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c1efef5a74bd11c5828c14cea7fa6bf3a30be1e7802808372a2a9a6dc70570 +size 146709 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_f00b3e2c752ac3cf926a.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_f00b3e2c752ac3cf926a.png new file mode 100644 index 0000000000000000000000000000000000000000..9dea257094b6aa0266a742ee5054d82139931f79 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_f00b3e2c752ac3cf926a.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00b3e2c752ac3cf926abd1ea3f05edd8971039f15c2942bf0f1b0f461474d20 +size 139409 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_67d5ba7897e123897b95.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_67d5ba7897e123897b95.png new file mode 100644 index 0000000000000000000000000000000000000000..b056ae0ee00d37c8788b9b287aeb3c3e4750a57c --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_67d5ba7897e123897b95.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d5ba7897e123897b95025d14df4dec47f70dce18f06b438e40a2d23849dba2 +size 149238 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_9c128d777c7dab549107.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_9c128d777c7dab549107.png new file mode 100644 index 0000000000000000000000000000000000000000..cb792e41b902b2ff19190ae6ee9ec5107b3b2152 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_9c128d777c7dab549107.png @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c128d777c7dab5491071e2e6a43807bdb349d7bab1d8f8426085cc6619d8446 +size 141809 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_4274b237825ef8cf5d05.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_4274b237825ef8cf5d05.png new file mode 100644 index 0000000000000000000000000000000000000000..43c10cec82a705101e7be098b137740f7bb33d3d --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_4274b237825ef8cf5d05.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4274b237825ef8cf5d05dfb41e4ed36fd5fe68ed60a35d9c9a8f93e75a3e671a +size 140719 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_de7aecbbb4729ab5af9d.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_de7aecbbb4729ab5af9d.png new file mode 100644 index 0000000000000000000000000000000000000000..f6cdc3dcb18bc50f8fedb6536d2dcabcb5c7c203 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_de7aecbbb4729ab5af9d.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7aecbbb4729ab5af9d55510c945bfc145abcc757a87d55a726687e2aca4841 +size 145540 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_09fa45bbfff36049e141.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_09fa45bbfff36049e141.png new file mode 100644 index 0000000000000000000000000000000000000000..39ad914a2ae6a66a4c753749ab5b532aa800f81c --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_09fa45bbfff36049e141.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09fa45bbfff36049e1418e1c56d43cf7cde9f6e74b57269422bd9132aee4b135 +size 162875 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_d8fc778d368d5c2cb79c.png b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_d8fc778d368d5c2cb79c.png new file mode 100644 index 0000000000000000000000000000000000000000..4279bc6a87471945e644d0aaa512d4e46b42424b --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_d8fc778d368d5c2cb79c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8fc778d368d5c2cb79c7b208b465f91f0df0b3653679648d8f4ba8d3d0ebeaf +size 172462 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/output.log b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..eb25380dc7f24da8c6f954dcd4815ef0c539d644 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/output.log @@ -0,0 +1,569 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 47.74it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 14:17:43 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8237.97it/s] +12/09/2025 14:17:44 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=32, W'=56 +12/09/2025 14:17:44 - INFO - __main__ - Theoretical dimensions: F'=1, H'=32, W'=56 +12/09/2025 14:17:44 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:17:45 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 14:18:01 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:18:01 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 14:18:03 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 14:18:04 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 14:18:04 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 14:18:04 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 14:18:04 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 14:18:11 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 14:18:11 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 14:18:11 - INFO - __main__ - Dataloader configuration: +12/09/2025 14:18:11 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 14:18:11 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 14:18:11 - INFO - __main__ - - persistent_workers: True +12/09/2025 14:18:11 - INFO - __main__ - - pin_memory: True +12/09/2025 14:18:11 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 14:18:28 - INFO - __main__ - ***** Running training ***** +12/09/2025 14:18:28 - INFO - __main__ - Num training steps = 10000 +12/09/2025 14:18:28 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 14:18:28 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/09/2025 14:18:28 - INFO - __main__ - Gradient Accumulation steps = 1 +12/09/2025 14:18:44 - INFO - __main__ - Step: 10 Loss: 11.0786 LR: 0.000300 +12/09/2025 14:18:52 - INFO - __main__ - Step: 20 Loss: 11.0749 LR: 0.000300 +12/09/2025 14:19:00 - INFO - __main__ - Step: 30 Loss: 11.0746 LR: 0.000300 +12/09/2025 14:19:07 - INFO - __main__ - Step: 40 Loss: 11.0719 LR: 0.000300 +12/09/2025 14:19:14 - INFO - __main__ - Step: 50 Loss: 11.0722 LR: 0.000300 +12/09/2025 14:19:22 - INFO - __main__ - Step: 60 Loss: 11.0700 LR: 0.000300 +12/09/2025 14:19:29 - INFO - __main__ - Step: 70 Loss: 11.0685 LR: 0.000300 +12/09/2025 14:19:37 - INFO - __main__ - Step: 80 Loss: 11.0656 LR: 0.000300 +12/09/2025 14:19:44 - INFO - __main__ - Step: 90 Loss: 11.0620 LR: 0.000300 +12/09/2025 14:19:52 - INFO - __main__ - Step: 100 Loss: 11.0578 LR: 0.000300 +12/09/2025 14:19:59 - INFO - __main__ - Step: 110 Loss: 11.0488 LR: 0.000300 +12/09/2025 14:20:07 - INFO - __main__ - Step: 120 Loss: 11.0397 LR: 0.000300 +12/09/2025 14:20:14 - INFO - __main__ - Step: 130 Loss: 11.0290 LR: 0.000300 +12/09/2025 14:20:22 - INFO - __main__ - Step: 140 Loss: 11.0018 LR: 0.000300 +12/09/2025 14:20:29 - INFO - __main__ - Step: 150 Loss: 10.9748 LR: 0.000300 +12/09/2025 14:20:36 - INFO - __main__ - Step: 160 Loss: 10.9484 LR: 0.000300 +12/09/2025 14:20:44 - INFO - __main__ - Step: 170 Loss: 10.9394 LR: 0.000300 +12/09/2025 14:20:51 - INFO - __main__ - Step: 180 Loss: 10.8992 LR: 0.000300 +12/09/2025 14:20:58 - INFO - __main__ - Step: 190 Loss: 10.8861 LR: 0.000300 +12/09/2025 14:21:06 - INFO - __main__ - Step: 200 Loss: 10.8595 LR: 0.000300 +12/09/2025 14:21:13 - INFO - __main__ - Step: 210 Loss: 10.8192 LR: 0.000300 +12/09/2025 14:21:20 - INFO - __main__ - Step: 220 Loss: 10.8125 LR: 0.000300 +12/09/2025 14:21:28 - INFO - __main__ - Step: 230 Loss: 10.8000 LR: 0.000300 +12/09/2025 14:21:36 - INFO - __main__ - Step: 240 Loss: 10.7867 LR: 0.000300 +12/09/2025 14:21:43 - INFO - __main__ - Step: 250 Loss: 10.7790 LR: 0.000300 +12/09/2025 14:21:51 - INFO - __main__ - Step: 260 Loss: 10.7543 LR: 0.000300 +12/09/2025 14:21:58 - INFO - __main__ - Step: 270 Loss: 10.7506 LR: 0.000300 +12/09/2025 14:22:06 - INFO - __main__ - Step: 280 Loss: 10.7433 LR: 0.000300 +12/09/2025 14:22:14 - INFO - __main__ - Step: 290 Loss: 10.7032 LR: 0.000300 +12/09/2025 14:22:21 - INFO - __main__ - Step: 300 Loss: 10.6494 LR: 0.000300 +12/09/2025 14:22:28 - INFO - __main__ - Step: 310 Loss: 10.6978 LR: 0.000300 +12/09/2025 14:22:36 - INFO - __main__ - Step: 320 Loss: 10.7025 LR: 0.000300 +12/09/2025 14:22:44 - INFO - __main__ - Step: 330 Loss: 10.6732 LR: 0.000300 +12/09/2025 14:22:51 - INFO - __main__ - Step: 340 Loss: 10.6721 LR: 0.000300 +12/09/2025 14:22:59 - INFO - __main__ - Step: 350 Loss: 10.6566 LR: 0.000300 +12/09/2025 14:23:07 - INFO - __main__ - Step: 360 Loss: 10.6685 LR: 0.000300 +12/09/2025 14:23:16 - INFO - __main__ - Step: 370 Loss: 10.5958 LR: 0.000300 +12/09/2025 14:23:23 - INFO - __main__ - Step: 380 Loss: 10.6540 LR: 0.000300 +12/09/2025 14:23:32 - INFO - __main__ - Step: 390 Loss: 10.6569 LR: 0.000300 +12/09/2025 14:23:40 - INFO - __main__ - Step: 400 Loss: 10.5789 LR: 0.000300 +12/09/2025 14:23:48 - INFO - __main__ - Step: 410 Loss: 10.5968 LR: 0.000300 +12/09/2025 14:23:57 - INFO - __main__ - Step: 420 Loss: 10.5736 LR: 0.000300 +12/09/2025 14:24:05 - INFO - __main__ - Step: 430 Loss: 10.6496 LR: 0.000300 +12/09/2025 14:24:13 - INFO - __main__ - Step: 440 Loss: 10.5696 LR: 0.000300 +12/09/2025 
14:24:20 - INFO - __main__ - Step: 450 Loss: 10.5662 LR: 0.000300 +12/09/2025 14:24:28 - INFO - __main__ - Step: 460 Loss: 10.5439 LR: 0.000300 +12/09/2025 14:24:36 - INFO - __main__ - Step: 470 Loss: 10.5309 LR: 0.000300 +12/09/2025 14:24:45 - INFO - __main__ - Step: 480 Loss: 10.5726 LR: 0.000300 +12/09/2025 14:24:53 - INFO - __main__ - Step: 490 Loss: 10.5672 LR: 0.000300 +12/09/2025 14:25:02 - INFO - __main__ - Step: 500 Loss: 10.5413 LR: 0.000300 +12/09/2025 14:25:02 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500 +12/09/2025 14:25:11 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/optimizer.bin +12/09/2025 14:25:11 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/scheduler.bin +12/09/2025 14:25:11 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/sampler.bin +12/09/2025 14:25:11 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500/random_states_0.pkl +12/09/2025 14:25:11 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-500 +12/09/2025 14:25:11 - INFO - __main__ - Generating videos for validation... +12/09/2025 14:25:11 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.60it/s] +12/09/2025 14:25:18 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +12/09/2025 14:25:28 - INFO - __main__ - Step: 510 Loss: 10.5050 LR: 0.000300 +12/09/2025 14:25:36 - INFO - __main__ - Step: 520 Loss: 10.5404 LR: 0.000300 +12/09/2025 14:25:44 - INFO - __main__ - Step: 530 Loss: 10.5635 LR: 0.000300 +12/09/2025 14:25:52 - INFO - __main__ - Step: 540 Loss: 10.4973 LR: 0.000300 +12/09/2025 14:26:00 - INFO - __main__ - Step: 550 Loss: 10.4899 LR: 0.000300 +12/09/2025 14:26:08 - INFO - __main__ - Step: 560 Loss: 10.5921 LR: 0.000300 +12/09/2025 14:26:16 - INFO - __main__ - Step: 570 Loss: 10.5204 LR: 0.000300 +12/09/2025 14:26:24 - INFO - __main__ - Step: 580 Loss: 10.5032 LR: 0.000300 +12/09/2025 14:26:32 - INFO - __main__ - Step: 590 Loss: 10.5714 LR: 0.000300 +12/09/2025 14:26:41 - INFO - __main__ - Step: 600 Loss: 10.5378 LR: 0.000300 +12/09/2025 14:26:49 - INFO - __main__ - Step: 610 Loss: 10.4954 LR: 0.000300 +12/09/2025 14:26:57 - INFO - __main__ - Step: 620 Loss: 10.4548 LR: 0.000300 +12/09/2025 14:27:05 - INFO - __main__ - Step: 630 Loss: 10.4652 LR: 0.000300 +12/09/2025 14:27:14 - INFO - __main__ - Step: 640 Loss: 10.5761 LR: 0.000300 +12/09/2025 14:27:22 - INFO - __main__ - Step: 650 Loss: 10.4299 LR: 0.000300 +12/09/2025 14:27:30 - INFO - __main__ - Step: 660 Loss: 10.6078 LR: 0.000300 +12/09/2025 14:27:39 - INFO - __main__ - Step: 670 Loss: 10.3975 LR: 0.000300 +12/09/2025 14:27:48 - INFO - __main__ - Step: 680 Loss: 10.5504 LR: 0.000300 +12/09/2025 14:27:56 - INFO - __main__ - Step: 690 Loss: 10.5034 LR: 0.000300 +12/09/2025 14:28:04 - INFO - __main__ - Step: 700 Loss: 10.5199 LR: 0.000300 +12/09/2025 14:28:12 - INFO - __main__ - Step: 710 Loss: 10.4411 LR: 0.000300 +12/09/2025 14:28:21 - INFO - 
__main__ - Step: 720 Loss: 10.5867 LR: 0.000300 +12/09/2025 14:28:30 - INFO - __main__ - Step: 730 Loss: 10.5285 LR: 0.000300 +12/09/2025 14:28:38 - INFO - __main__ - Step: 740 Loss: 10.5524 LR: 0.000300 +12/09/2025 14:28:47 - INFO - __main__ - Step: 750 Loss: 10.4680 LR: 0.000300 +12/09/2025 14:28:55 - INFO - __main__ - Step: 760 Loss: 10.5404 LR: 0.000300 +12/09/2025 14:29:04 - INFO - __main__ - Step: 770 Loss: 10.4509 LR: 0.000300 +12/09/2025 14:29:12 - INFO - __main__ - Step: 780 Loss: 10.5221 LR: 0.000300 +12/09/2025 14:29:21 - INFO - __main__ - Step: 790 Loss: 10.4907 LR: 0.000300 +12/09/2025 14:29:29 - INFO - __main__ - Step: 800 Loss: 10.3947 LR: 0.000300 +12/09/2025 14:29:38 - INFO - __main__ - Step: 810 Loss: 10.5406 LR: 0.000300 +12/09/2025 14:29:46 - INFO - __main__ - Step: 820 Loss: 10.4586 LR: 0.000300 +12/09/2025 14:29:55 - INFO - __main__ - Step: 830 Loss: 10.5223 LR: 0.000300 +12/09/2025 14:30:04 - INFO - __main__ - Step: 840 Loss: 10.4387 LR: 0.000300 +12/09/2025 14:30:12 - INFO - __main__ - Step: 850 Loss: 10.5751 LR: 0.000300 +12/09/2025 14:30:20 - INFO - __main__ - Step: 860 Loss: 10.4888 LR: 0.000300 +12/09/2025 14:30:28 - INFO - __main__ - Step: 870 Loss: 10.5258 LR: 0.000300 +12/09/2025 14:30:37 - INFO - __main__ - Step: 880 Loss: 10.3787 LR: 0.000300 +12/09/2025 14:30:45 - INFO - __main__ - Step: 890 Loss: 10.3855 LR: 0.000300 +12/09/2025 14:30:53 - INFO - __main__ - Step: 900 Loss: 10.3843 LR: 0.000300 +12/09/2025 14:31:02 - INFO - __main__ - Step: 910 Loss: 10.4360 LR: 0.000300 +12/09/2025 14:31:10 - INFO - __main__ - Step: 920 Loss: 10.5285 LR: 0.000300 +12/09/2025 14:31:18 - INFO - __main__ - Step: 930 Loss: 10.4288 LR: 0.000300 +12/09/2025 14:31:27 - INFO - __main__ - Step: 940 Loss: 10.5211 LR: 0.000300 +12/09/2025 14:31:36 - INFO - __main__ - Step: 950 Loss: 10.4077 LR: 0.000300 +12/09/2025 14:31:45 - INFO - __main__ - Step: 960 Loss: 10.3606 LR: 0.000300 +12/09/2025 14:31:53 - INFO - __main__ - Step: 970 Loss: 10.3525 LR: 0.000300 +12/09/2025 14:32:02 - INFO - __main__ - Step: 980 Loss: 10.4681 LR: 0.000300 +12/09/2025 14:32:10 - INFO - __main__ - Step: 990 Loss: 10.4996 LR: 0.000300 +12/09/2025 14:32:19 - INFO - __main__ - Step: 1000 Loss: 10.4602 LR: 0.000300 +12/09/2025 14:32:19 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000 +12/09/2025 14:32:26 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/optimizer.bin +12/09/2025 14:32:26 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/scheduler.bin +12/09/2025 14:32:26 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/sampler.bin +12/09/2025 14:32:26 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000/random_states_0.pkl +12/09/2025 14:32:26 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1000 +12/09/2025 14:32:26 - INFO - __main__ - Generating videos for validation... +12/09/2025 14:32:26 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.77it/s] +12/09/2025 14:32:34 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +12/09/2025 14:32:42 - INFO - __main__ - Step: 1010 Loss: 10.5081 LR: 0.000300 +12/09/2025 14:32:50 - INFO - __main__ - Step: 1020 Loss: 10.4993 LR: 0.000300 +12/09/2025 14:32:58 - INFO - __main__ - Step: 1030 Loss: 10.3718 LR: 0.000300 +12/09/2025 14:33:07 - INFO - __main__ - Step: 1040 Loss: 10.5117 LR: 0.000300 +12/09/2025 14:33:15 - INFO - __main__ - Step: 1050 Loss: 10.3794 LR: 0.000300 +12/09/2025 14:33:23 - INFO - __main__ - Step: 1060 Loss: 10.5579 LR: 0.000300 +12/09/2025 14:33:32 - INFO - __main__ - Step: 1070 Loss: 10.4656 LR: 0.000300 +12/09/2025 14:33:40 - INFO - __main__ - Step: 1080 Loss: 10.5221 LR: 0.000300 +12/09/2025 14:33:48 - INFO - __main__ - Step: 1090 Loss: 10.4562 LR: 0.000300 +12/09/2025 14:33:56 - INFO - __main__ - Step: 1100 Loss: 10.4584 LR: 0.000300 +12/09/2025 14:34:03 - INFO - __main__ - Step: 1110 Loss: 10.4657 LR: 0.000300 +12/09/2025 14:34:11 - INFO - __main__ - Step: 1120 Loss: 10.3170 LR: 0.000300 +12/09/2025 14:34:20 - INFO - __main__ - Step: 1130 Loss: 10.4157 LR: 0.000300 +12/09/2025 14:34:28 - INFO - __main__ - Step: 1140 Loss: 10.5589 LR: 0.000300 +12/09/2025 14:34:36 - INFO - __main__ - Step: 1150 Loss: 10.4533 LR: 0.000300 +12/09/2025 14:34:45 - INFO - __main__ - Step: 1160 Loss: 10.4369 LR: 0.000300 +12/09/2025 14:34:53 - INFO - __main__ - Step: 1170 Loss: 10.3897 LR: 0.000300 +12/09/2025 14:35:01 - INFO - __main__ - Step: 1180 Loss: 10.4845 LR: 0.000300 +12/09/2025 14:35:09 - INFO - __main__ - Step: 1190 Loss: 10.3569 LR: 0.000300 +12/09/2025 14:35:17 - INFO - __main__ - Step: 1200 Loss: 10.4201 LR: 0.000300 +12/09/2025 14:35:25 - INFO - __main__ - Step: 1210 Loss: 10.3938 LR: 0.000300 +12/09/2025 14:35:34 - INFO - __main__ - Step: 1220 Loss: 10.3758 LR: 0.000300 +12/09/2025 14:35:43 - INFO - __main__ - Step: 1230 Loss: 10.5326 LR: 0.000300 +12/09/2025 14:35:51 - INFO - __main__ - Step: 1240 Loss: 10.3612 LR: 0.000300 +12/09/2025 14:36:00 - INFO - __main__ - Step: 1250 Loss: 10.4457 LR: 0.000300 +12/09/2025 14:36:08 - INFO - __main__ - Step: 1260 Loss: 10.4828 LR: 0.000300 +12/09/2025 14:36:16 - INFO - __main__ - Step: 1270 Loss: 10.4687 LR: 0.000300 +12/09/2025 14:36:25 - INFO - __main__ - Step: 1280 Loss: 10.3611 LR: 0.000300 +12/09/2025 14:36:33 - INFO - __main__ - Step: 1290 Loss: 10.4610 LR: 0.000300 +12/09/2025 14:36:42 - INFO - __main__ - Step: 1300 Loss: 10.4953 LR: 0.000300 +12/09/2025 14:36:50 - INFO - __main__ - Step: 1310 Loss: 10.4658 LR: 0.000300 +12/09/2025 14:36:59 - INFO - __main__ - Step: 1320 Loss: 10.4092 LR: 0.000300 +12/09/2025 14:37:07 - INFO - __main__ - Step: 1330 Loss: 10.5397 LR: 0.000300 +12/09/2025 14:37:15 - INFO - __main__ - Step: 1340 Loss: 10.4840 LR: 0.000300 +12/09/2025 14:37:23 - INFO - __main__ - Step: 1350 Loss: 10.4958 LR: 0.000300 +12/09/2025 14:37:32 - INFO - __main__ - Step: 1360 Loss: 10.5926 LR: 0.000300 +12/09/2025 14:37:40 - INFO - __main__ - Step: 1370 Loss: 10.4392 LR: 0.000300 +12/09/2025 14:37:49 - INFO - __main__ - Step: 1380 Loss: 10.5155 LR: 0.000300 +12/09/2025 14:37:57 - INFO - __main__ - Step: 1390 Loss: 10.4504 LR: 0.000300 +12/09/2025 14:38:06 - INFO - __main__ - Step: 1400 Loss: 10.5094 LR: 0.000300 +12/09/2025 14:38:14 - INFO - __main__ - Step: 1410 Loss: 10.4910 LR: 0.000300 +12/09/2025 
14:38:22 - INFO - __main__ - Step: 1420 Loss: 10.4078 LR: 0.000300 +12/09/2025 14:38:30 - INFO - __main__ - Step: 1430 Loss: 10.4891 LR: 0.000300 +12/09/2025 14:38:39 - INFO - __main__ - Step: 1440 Loss: 10.4848 LR: 0.000300 +12/09/2025 14:38:47 - INFO - __main__ - Step: 1450 Loss: 10.3720 LR: 0.000300 +12/09/2025 14:38:55 - INFO - __main__ - Step: 1460 Loss: 10.5407 LR: 0.000300 +12/09/2025 14:39:04 - INFO - __main__ - Step: 1470 Loss: 10.4086 LR: 0.000300 +12/09/2025 14:39:12 - INFO - __main__ - Step: 1480 Loss: 10.4516 LR: 0.000300 +12/09/2025 14:39:20 - INFO - __main__ - Step: 1490 Loss: 10.4011 LR: 0.000300 +12/09/2025 14:39:28 - INFO - __main__ - Step: 1500 Loss: 10.4982 LR: 0.000300 +12/09/2025 14:39:28 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500 +12/09/2025 14:39:35 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/optimizer.bin +12/09/2025 14:39:35 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/scheduler.bin +12/09/2025 14:39:35 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/sampler.bin +12/09/2025 14:39:35 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500/random_states_0.pkl +12/09/2025 14:39:35 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-1500 +12/09/2025 14:39:35 - INFO - __main__ - Generating videos for validation... +12/09/2025 14:39:35 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.62it/s] +12/09/2025 14:39:42 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +12/09/2025 14:39:51 - INFO - __main__ - Step: 1510 Loss: 10.3688 LR: 0.000300 +12/09/2025 14:39:59 - INFO - __main__ - Step: 1520 Loss: 10.4142 LR: 0.000300 +12/09/2025 14:40:08 - INFO - __main__ - Step: 1530 Loss: 10.4045 LR: 0.000300 +12/09/2025 14:40:16 - INFO - __main__ - Step: 1540 Loss: 10.4536 LR: 0.000300 +12/09/2025 14:40:24 - INFO - __main__ - Step: 1550 Loss: 10.4827 LR: 0.000300 +12/09/2025 14:40:33 - INFO - __main__ - Step: 1560 Loss: 10.4726 LR: 0.000300 +12/09/2025 14:40:41 - INFO - __main__ - Step: 1570 Loss: 10.4621 LR: 0.000300 +12/09/2025 14:40:50 - INFO - __main__ - Step: 1580 Loss: 10.4076 LR: 0.000300 +12/09/2025 14:40:57 - INFO - __main__ - Step: 1590 Loss: 10.3771 LR: 0.000300 +12/09/2025 14:41:05 - INFO - __main__ - Step: 1600 Loss: 10.4990 LR: 0.000300 +12/09/2025 14:41:13 - INFO - __main__ - Step: 1610 Loss: 10.4034 LR: 0.000300 +12/09/2025 14:41:21 - INFO - __main__ - Step: 1620 Loss: 10.4673 LR: 0.000300 +12/09/2025 14:41:30 - INFO - __main__ - Step: 1630 Loss: 10.5685 LR: 0.000300 +12/09/2025 14:41:38 - INFO - __main__ - Step: 1640 Loss: 10.2373 LR: 0.000300 +12/09/2025 14:41:47 - INFO - __main__ - Step: 1650 Loss: 10.3953 LR: 0.000300 +12/09/2025 14:41:54 - INFO - __main__ - Step: 1660 Loss: 10.4320 LR: 0.000300 +12/09/2025 14:42:02 - INFO - __main__ - Step: 1670 Loss: 10.3989 LR: 0.000300 +12/09/2025 14:42:10 - INFO - __main__ - Step: 1680 Loss: 10.3984 LR: 0.000300 +12/09/2025 14:42:18 - INFO - __main__ - Step: 1690 Loss: 10.3753 LR: 0.000300 +12/09/2025 14:42:26 - INFO - __main__ - Step: 1700 Loss: 10.3394 LR: 0.000300 +12/09/2025 14:42:34 - INFO - __main__ - Step: 1710 Loss: 10.3646 LR: 0.000300 +12/09/2025 14:42:43 - INFO - __main__ - Step: 1720 Loss: 10.4932 LR: 0.000300 +12/09/2025 14:42:50 - INFO - __main__ - Step: 1730 Loss: 10.2850 LR: 0.000300 +12/09/2025 14:42:58 - INFO - __main__ - Step: 1740 Loss: 10.4153 LR: 0.000300 +12/09/2025 14:43:05 - INFO - __main__ - Step: 1750 Loss: 10.4864 LR: 0.000300 +12/09/2025 14:43:13 - INFO - __main__ - Step: 1760 Loss: 10.3920 LR: 0.000300 +12/09/2025 14:43:21 - INFO - __main__ - Step: 1770 Loss: 10.4139 LR: 0.000300 +12/09/2025 14:43:29 - INFO - __main__ - Step: 1780 Loss: 10.4104 LR: 0.000300 +12/09/2025 14:43:37 - INFO - __main__ - Step: 1790 Loss: 10.3351 LR: 0.000300 +12/09/2025 14:43:45 - INFO - __main__ - Step: 1800 Loss: 10.4453 LR: 0.000300 +12/09/2025 14:43:53 - INFO - __main__ - Step: 1810 Loss: 10.4127 LR: 0.000300 +12/09/2025 14:44:01 - INFO - __main__ - Step: 1820 Loss: 10.4920 LR: 0.000300 +12/09/2025 14:44:09 - INFO - __main__ - Step: 1830 Loss: 10.4515 LR: 0.000300 +12/09/2025 14:44:16 - INFO - __main__ - Step: 1840 Loss: 10.4176 LR: 0.000300 +12/09/2025 14:44:24 - INFO - __main__ - Step: 1850 Loss: 10.4255 LR: 0.000300 +12/09/2025 14:44:32 - INFO - __main__ - Step: 1860 Loss: 10.3986 LR: 0.000300 +12/09/2025 14:44:40 - INFO - __main__ - Step: 1870 Loss: 10.3737 LR: 0.000300 +12/09/2025 14:44:47 - INFO - __main__ - Step: 1880 Loss: 10.4050 LR: 0.000300 +12/09/2025 14:44:55 - INFO - __main__ - Step: 1890 Loss: 10.3659 LR: 0.000300 +12/09/2025 14:45:02 - INFO - __main__ - Step: 1900 Loss: 10.3975 LR: 0.000300 +12/09/2025 14:45:10 - INFO - __main__ - Step: 1910 Loss: 10.4211 LR: 0.000300 +12/09/2025 
14:45:19 - INFO - __main__ - Step: 1920 Loss: 10.3395 LR: 0.000300 +12/09/2025 14:45:27 - INFO - __main__ - Step: 1930 Loss: 10.4324 LR: 0.000300 +12/09/2025 14:45:35 - INFO - __main__ - Step: 1940 Loss: 10.5370 LR: 0.000300 +12/09/2025 14:45:43 - INFO - __main__ - Step: 1950 Loss: 10.4142 LR: 0.000300 +12/09/2025 14:45:51 - INFO - __main__ - Step: 1960 Loss: 10.3945 LR: 0.000300 +12/09/2025 14:45:58 - INFO - __main__ - Step: 1970 Loss: 10.4885 LR: 0.000300 +12/09/2025 14:46:07 - INFO - __main__ - Step: 1980 Loss: 10.4398 LR: 0.000300 +12/09/2025 14:46:15 - INFO - __main__ - Step: 1990 Loss: 10.3821 LR: 0.000300 +12/09/2025 14:46:23 - INFO - __main__ - Step: 2000 Loss: 10.4714 LR: 0.000300 +12/09/2025 14:46:23 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000 +12/09/2025 14:46:31 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/optimizer.bin +12/09/2025 14:46:31 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/scheduler.bin +12/09/2025 14:46:31 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/sampler.bin +12/09/2025 14:46:31 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000/random_states_0.pkl +12/09/2025 14:46:31 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2000 +12/09/2025 14:46:31 - INFO - __main__ - Generating videos for validation... +12/09/2025 14:46:31 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.74it/s] +12/09/2025 14:46:37 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +12/09/2025 14:46:46 - INFO - __main__ - Step: 2010 Loss: 10.4913 LR: 0.000300 +12/09/2025 14:46:54 - INFO - __main__ - Step: 2020 Loss: 10.4892 LR: 0.000300 +12/09/2025 14:47:02 - INFO - __main__ - Step: 2030 Loss: 10.5157 LR: 0.000300 +12/09/2025 14:47:11 - INFO - __main__ - Step: 2040 Loss: 10.4690 LR: 0.000300 +12/09/2025 14:47:18 - INFO - __main__ - Step: 2050 Loss: 10.4872 LR: 0.000300 +12/09/2025 14:47:26 - INFO - __main__ - Step: 2060 Loss: 10.4257 LR: 0.000300 +12/09/2025 14:47:35 - INFO - __main__ - Step: 2070 Loss: 10.4276 LR: 0.000300 +12/09/2025 14:47:44 - INFO - __main__ - Step: 2080 Loss: 10.3805 LR: 0.000300 +12/09/2025 14:47:52 - INFO - __main__ - Step: 2090 Loss: 10.5010 LR: 0.000300 +12/09/2025 14:48:00 - INFO - __main__ - Step: 2100 Loss: 10.4214 LR: 0.000300 +12/09/2025 14:48:09 - INFO - __main__ - Step: 2110 Loss: 10.4174 LR: 0.000300 +12/09/2025 14:48:16 - INFO - __main__ - Step: 2120 Loss: 10.4200 LR: 0.000300 +12/09/2025 14:48:24 - INFO - __main__ - Step: 2130 Loss: 10.3379 LR: 0.000300 +12/09/2025 14:48:32 - INFO - __main__ - Step: 2140 Loss: 10.4045 LR: 0.000300 +12/09/2025 14:48:41 - INFO - __main__ - Step: 2150 Loss: 10.4128 LR: 0.000300 +12/09/2025 14:48:49 - INFO - __main__ - Step: 2160 Loss: 10.3600 LR: 0.000300 +12/09/2025 14:48:58 - INFO - __main__ - Step: 2170 Loss: 10.4262 LR: 0.000300 +12/09/2025 14:49:06 - INFO - __main__ - Step: 2180 Loss: 10.3869 LR: 0.000300 +12/09/2025 14:49:15 - INFO - __main__ - Step: 2190 Loss: 10.3994 LR: 0.000300 +12/09/2025 14:49:23 - INFO - __main__ - Step: 2200 Loss: 10.3750 LR: 0.000300 +12/09/2025 14:49:31 - INFO - __main__ - Step: 2210 Loss: 10.4021 LR: 0.000300 +12/09/2025 14:49:40 - INFO - __main__ - Step: 2220 Loss: 10.3652 LR: 0.000300 +12/09/2025 14:49:49 - INFO - __main__ - Step: 2230 Loss: 10.5028 LR: 0.000300 +12/09/2025 14:49:58 - INFO - __main__ - Step: 2240 Loss: 10.4137 LR: 0.000300 +12/09/2025 14:50:07 - INFO - __main__ - Step: 2250 Loss: 10.4091 LR: 0.000300 +12/09/2025 14:50:16 - INFO - __main__ - Step: 2260 Loss: 10.5426 LR: 0.000300 +12/09/2025 14:50:24 - INFO - __main__ - Step: 2270 Loss: 10.4749 LR: 0.000300 +12/09/2025 14:50:32 - INFO - __main__ - Step: 2280 Loss: 10.4904 LR: 0.000300 +12/09/2025 14:50:40 - INFO - __main__ - Step: 2290 Loss: 10.3283 LR: 0.000300 +12/09/2025 14:50:49 - INFO - __main__ - Step: 2300 Loss: 10.4035 LR: 0.000300 +12/09/2025 14:50:57 - INFO - __main__ - Step: 2310 Loss: 10.4176 LR: 0.000300 +12/09/2025 14:51:06 - INFO - __main__ - Step: 2320 Loss: 10.5085 LR: 0.000300 +12/09/2025 14:51:15 - INFO - __main__ - Step: 2330 Loss: 10.3436 LR: 0.000300 +12/09/2025 14:51:23 - INFO - __main__ - Step: 2340 Loss: 10.4886 LR: 0.000300 +12/09/2025 14:51:32 - INFO - __main__ - Step: 2350 Loss: 10.4517 LR: 0.000300 +12/09/2025 14:51:40 - INFO - __main__ - Step: 2360 Loss: 10.3703 LR: 0.000300 +12/09/2025 14:51:47 - INFO - __main__ - Step: 2370 Loss: 10.4542 LR: 0.000300 +12/09/2025 14:51:56 - INFO - __main__ - Step: 2380 Loss: 10.2895 LR: 0.000300 +12/09/2025 14:52:04 - INFO - __main__ - Step: 2390 Loss: 10.3339 LR: 0.000300 +12/09/2025 14:52:13 - INFO - __main__ - Step: 2400 Loss: 10.3490 LR: 0.000300 +12/09/2025 14:52:21 - INFO - __main__ - Step: 2410 Loss: 10.4567 LR: 0.000300 +12/09/2025 
14:52:29 - INFO - __main__ - Step: 2420 Loss: 10.5396 LR: 0.000300 +12/09/2025 14:52:37 - INFO - __main__ - Step: 2430 Loss: 10.5307 LR: 0.000300 +12/09/2025 14:52:45 - INFO - __main__ - Step: 2440 Loss: 10.4733 LR: 0.000300 +12/09/2025 14:52:53 - INFO - __main__ - Step: 2450 Loss: 10.3741 LR: 0.000300 +12/09/2025 14:53:01 - INFO - __main__ - Step: 2460 Loss: 10.4488 LR: 0.000300 +12/09/2025 14:53:10 - INFO - __main__ - Step: 2470 Loss: 10.4334 LR: 0.000300 +12/09/2025 14:53:18 - INFO - __main__ - Step: 2480 Loss: 10.4863 LR: 0.000300 +12/09/2025 14:53:26 - INFO - __main__ - Step: 2490 Loss: 10.4204 LR: 0.000300 +12/09/2025 14:53:34 - INFO - __main__ - Step: 2500 Loss: 10.4145 LR: 0.000300 +12/09/2025 14:53:34 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500 +12/09/2025 14:53:42 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/optimizer.bin +12/09/2025 14:53:42 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/scheduler.bin +12/09/2025 14:53:42 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/sampler.bin +12/09/2025 14:53:42 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500/random_states_0.pkl +12/09/2025 14:53:42 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-2500 +12/09/2025 14:53:42 - INFO - __main__ - Generating videos for validation... +12/09/2025 14:53:42 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.74it/s] +12/09/2025 14:53:49 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +12/09/2025 14:53:56 - INFO - __main__ - Step: 2510 Loss: 10.4580 LR: 0.000300 +12/09/2025 14:54:04 - INFO - __main__ - Step: 2520 Loss: 10.4518 LR: 0.000300 +12/09/2025 14:54:12 - INFO - __main__ - Step: 2530 Loss: 10.5546 LR: 0.000300 +12/09/2025 14:54:20 - INFO - __main__ - Step: 2540 Loss: 10.4131 LR: 0.000300 +12/09/2025 14:54:29 - INFO - __main__ - Step: 2550 Loss: 10.3408 LR: 0.000300 +12/09/2025 14:54:37 - INFO - __main__ - Step: 2560 Loss: 10.3797 LR: 0.000300 +12/09/2025 14:54:45 - INFO - __main__ - Step: 2570 Loss: 10.4836 LR: 0.000300 +12/09/2025 14:54:53 - INFO - __main__ - Step: 2580 Loss: 10.2595 LR: 0.000300 +12/09/2025 14:55:00 - INFO - __main__ - Step: 2590 Loss: 10.3303 LR: 0.000300 +12/09/2025 14:55:07 - INFO - __main__ - Step: 2600 Loss: 10.3411 LR: 0.000300 +12/09/2025 14:55:16 - INFO - __main__ - Step: 2610 Loss: 10.4018 LR: 0.000300 +12/09/2025 14:55:24 - INFO - __main__ - Step: 2620 Loss: 10.4771 LR: 0.000300 +12/09/2025 14:55:32 - INFO - __main__ - Step: 2630 Loss: 10.3311 LR: 0.000300 +12/09/2025 14:55:40 - INFO - __main__ - Step: 2640 Loss: 10.3900 LR: 0.000300 +12/09/2025 14:55:49 - INFO - __main__ - Step: 2650 Loss: 10.3524 LR: 0.000300 +12/09/2025 14:55:57 - INFO - __main__ - Step: 2660 Loss: 10.3557 LR: 0.000300 +12/09/2025 14:56:05 - INFO - __main__ - Step: 2670 Loss: 10.3889 LR: 0.000300 +12/09/2025 14:56:13 - INFO - __main__ - Step: 2680 Loss: 10.3013 LR: 0.000300 +12/09/2025 14:56:21 - INFO - __main__ - Step: 2690 Loss: 10.4000 LR: 0.000300 +12/09/2025 14:56:29 - INFO - __main__ - Step: 2700 Loss: 10.4197 LR: 0.000300 +12/09/2025 14:56:37 - INFO - __main__ - Step: 2710 Loss: 10.4226 LR: 0.000300 +12/09/2025 14:56:46 - INFO - __main__ - Step: 2720 Loss: 10.3753 LR: 0.000300 +12/09/2025 14:56:54 - INFO - __main__ - Step: 2730 Loss: 10.4958 LR: 0.000300 +12/09/2025 14:57:02 - INFO - __main__ - Step: 2740 Loss: 10.3689 LR: 0.000300 +12/09/2025 14:57:10 - INFO - __main__ - Step: 2750 Loss: 10.4282 LR: 0.000300 +12/09/2025 14:57:18 - INFO - __main__ - Step: 2760 Loss: 10.4701 LR: 0.000300 +12/09/2025 14:57:26 - INFO - __main__ - Step: 2770 Loss: 10.4405 LR: 0.000300 +12/09/2025 14:57:35 - INFO - __main__ - Step: 2780 Loss: 10.4365 LR: 0.000300 +12/09/2025 14:57:43 - INFO - __main__ - Step: 2790 Loss: 10.4165 LR: 0.000300 +12/09/2025 14:57:52 - INFO - __main__ - Step: 2800 Loss: 10.4600 LR: 0.000300 +12/09/2025 14:58:00 - INFO - __main__ - Step: 2810 Loss: 10.3449 LR: 0.000300 +12/09/2025 14:58:08 - INFO - __main__ - Step: 2820 Loss: 10.3529 LR: 0.000300 +12/09/2025 14:58:16 - INFO - __main__ - Step: 2830 Loss: 10.3326 LR: 0.000300 +12/09/2025 14:58:25 - INFO - __main__ - Step: 2840 Loss: 10.5020 LR: 0.000300 +12/09/2025 14:58:33 - INFO - __main__ - Step: 2850 Loss: 10.4364 LR: 0.000300 +12/09/2025 14:58:41 - INFO - __main__ - Step: 2860 Loss: 10.3497 LR: 0.000300 +12/09/2025 14:58:49 - INFO - __main__ - Step: 2870 Loss: 10.4029 LR: 0.000300 +12/09/2025 14:58:58 - INFO - __main__ - Step: 2880 Loss: 10.3246 LR: 0.000300 +12/09/2025 14:59:06 - INFO - __main__ - Step: 2890 Loss: 10.4158 LR: 0.000300 +12/09/2025 14:59:14 - INFO - __main__ - Step: 2900 Loss: 10.3602 LR: 0.000300 +12/09/2025 14:59:22 - INFO - __main__ - Step: 2910 Loss: 10.3202 LR: 0.000300 +12/09/2025 
14:59:30 - INFO - __main__ - Step: 2920 Loss: 10.4786 LR: 0.000300 +12/09/2025 14:59:39 - INFO - __main__ - Step: 2930 Loss: 10.4029 LR: 0.000300 +12/09/2025 14:59:48 - INFO - __main__ - Step: 2940 Loss: 10.4191 LR: 0.000300 +12/09/2025 14:59:57 - INFO - __main__ - Step: 2950 Loss: 10.4091 LR: 0.000300 +12/09/2025 15:00:06 - INFO - __main__ - Step: 2960 Loss: 10.5146 LR: 0.000300 +12/09/2025 15:00:14 - INFO - __main__ - Step: 2970 Loss: 10.3983 LR: 0.000300 +12/09/2025 15:00:22 - INFO - __main__ - Step: 2980 Loss: 10.3034 LR: 0.000300 +12/09/2025 15:00:30 - INFO - __main__ - Step: 2990 Loss: 10.3820 LR: 0.000300 +12/09/2025 15:00:39 - INFO - __main__ - Step: 3000 Loss: 10.4916 LR: 0.000300 +12/09/2025 15:00:39 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000 +12/09/2025 15:00:46 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/optimizer.bin +12/09/2025 15:00:46 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/scheduler.bin +12/09/2025 15:00:46 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/sampler.bin +12/09/2025 15:00:46 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000/random_states_0.pkl +12/09/2025 15:00:46 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3000 +12/09/2025 15:00:46 - INFO - __main__ - Generating videos for validation... +12/09/2025 15:00:46 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.76it/s] +12/09/2025 15:00:53 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +12/09/2025 15:01:01 - INFO - __main__ - Step: 3010 Loss: 10.4128 LR: 0.000300 +12/09/2025 15:01:10 - INFO - __main__ - Step: 3020 Loss: 10.4473 LR: 0.000300 +12/09/2025 15:01:18 - INFO - __main__ - Step: 3030 Loss: 10.5300 LR: 0.000300 +12/09/2025 15:01:27 - INFO - __main__ - Step: 3040 Loss: 10.4155 LR: 0.000300 +12/09/2025 15:01:34 - INFO - __main__ - Step: 3050 Loss: 10.4354 LR: 0.000300 +12/09/2025 15:01:43 - INFO - __main__ - Step: 3060 Loss: 10.3338 LR: 0.000300 +12/09/2025 15:01:51 - INFO - __main__ - Step: 3070 Loss: 10.4649 LR: 0.000300 +12/09/2025 15:01:59 - INFO - __main__ - Step: 3080 Loss: 10.4024 LR: 0.000300 +12/09/2025 15:02:07 - INFO - __main__ - Step: 3090 Loss: 10.4597 LR: 0.000300 +12/09/2025 15:02:16 - INFO - __main__ - Step: 3100 Loss: 10.4229 LR: 0.000300 +12/09/2025 15:02:25 - INFO - __main__ - Step: 3110 Loss: 10.2890 LR: 0.000300 +12/09/2025 15:02:33 - INFO - __main__ - Step: 3120 Loss: 10.3721 LR: 0.000300 +12/09/2025 15:02:41 - INFO - __main__ - Step: 3130 Loss: 10.4633 LR: 0.000300 +12/09/2025 15:02:49 - INFO - __main__ - Step: 3140 Loss: 10.2339 LR: 0.000300 +12/09/2025 15:02:57 - INFO - __main__ - Step: 3150 Loss: 10.3505 LR: 0.000300 +12/09/2025 15:03:05 - INFO - __main__ - Step: 3160 Loss: 10.3656 LR: 0.000300 +12/09/2025 15:03:13 - INFO - __main__ - Step: 3170 Loss: 10.3447 LR: 0.000300 +12/09/2025 15:03:22 - INFO - __main__ - Step: 3180 Loss: 10.3553 LR: 0.000300 +12/09/2025 15:03:31 - INFO - __main__ - Step: 3190 Loss: 10.4069 LR: 0.000300 +12/09/2025 15:03:39 - INFO - __main__ - Step: 3200 Loss: 10.4120 LR: 0.000300 +12/09/2025 15:03:48 - INFO - __main__ - Step: 3210 Loss: 10.3601 LR: 0.000300 +12/09/2025 15:03:56 - INFO - __main__ - Step: 3220 Loss: 10.3693 LR: 0.000300 +12/09/2025 15:04:04 - INFO - __main__ - Step: 3230 Loss: 10.3294 LR: 0.000300 +12/09/2025 15:04:13 - INFO - __main__ - Step: 3240 Loss: 10.3824 LR: 0.000300 +12/09/2025 15:04:22 - INFO - __main__ - Step: 3250 Loss: 10.4374 LR: 0.000300 +12/09/2025 15:04:30 - INFO - __main__ - Step: 3260 Loss: 10.4087 LR: 0.000300 +12/09/2025 15:04:39 - INFO - __main__ - Step: 3270 Loss: 10.4182 LR: 0.000300 +12/09/2025 15:04:47 - INFO - __main__ - Step: 3280 Loss: 10.3291 LR: 0.000300 +12/09/2025 15:04:56 - INFO - __main__ - Step: 3290 Loss: 10.3474 LR: 0.000300 +12/09/2025 15:05:03 - INFO - __main__ - Step: 3300 Loss: 10.3270 LR: 0.000300 +12/09/2025 15:05:12 - INFO - __main__ - Step: 3310 Loss: 10.4398 LR: 0.000300 +12/09/2025 15:05:20 - INFO - __main__ - Step: 3320 Loss: 10.3624 LR: 0.000300 +12/09/2025 15:05:29 - INFO - __main__ - Step: 3330 Loss: 10.4954 LR: 0.000300 +12/09/2025 15:05:37 - INFO - __main__ - Step: 3340 Loss: 10.4612 LR: 0.000300 +12/09/2025 15:05:45 - INFO - __main__ - Step: 3350 Loss: 10.3814 LR: 0.000300 +12/09/2025 15:05:54 - INFO - __main__ - Step: 3360 Loss: 10.3554 LR: 0.000300 +12/09/2025 15:06:01 - INFO - __main__ - Step: 3370 Loss: 10.4198 LR: 0.000300 +12/09/2025 15:06:09 - INFO - __main__ - Step: 3380 Loss: 10.4446 LR: 0.000300 +12/09/2025 15:06:18 - INFO - __main__ - Step: 3390 Loss: 10.4236 LR: 0.000300 +12/09/2025 15:06:26 - INFO - __main__ - Step: 3400 Loss: 10.4261 LR: 0.000300 +12/09/2025 15:06:35 - INFO - __main__ - Step: 3410 Loss: 10.4843 LR: 0.000300 +12/09/2025 
15:06:43 - INFO - __main__ - Step: 3420 Loss: 10.3067 LR: 0.000300 +12/09/2025 15:06:51 - INFO - __main__ - Step: 3430 Loss: 10.2871 LR: 0.000300 +12/09/2025 15:06:59 - INFO - __main__ - Step: 3440 Loss: 10.4699 LR: 0.000300 +12/09/2025 15:07:07 - INFO - __main__ - Step: 3450 Loss: 10.5213 LR: 0.000300 +12/09/2025 15:07:15 - INFO - __main__ - Step: 3460 Loss: 10.4694 LR: 0.000300 +12/09/2025 15:07:24 - INFO - __main__ - Step: 3470 Loss: 10.3963 LR: 0.000300 +12/09/2025 15:07:32 - INFO - __main__ - Step: 3480 Loss: 10.4730 LR: 0.000300 +12/09/2025 15:07:41 - INFO - __main__ - Step: 3490 Loss: 10.4303 LR: 0.000300 +12/09/2025 15:07:49 - INFO - __main__ - Step: 3500 Loss: 10.3538 LR: 0.000300 +12/09/2025 15:07:49 - INFO - accelerate.accelerator - Saving current state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500 +12/09/2025 15:07:57 - INFO - accelerate.checkpointing - Optimizer state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/optimizer.bin +12/09/2025 15:07:57 - INFO - accelerate.checkpointing - Scheduler state saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/scheduler.bin +12/09/2025 15:07:57 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/sampler.bin +12/09/2025 15:07:57 - INFO - accelerate.checkpointing - Random states saved in output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500/random_states_0.pkl +12/09/2025 15:07:57 - INFO - __main__ - Saved state to output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp/checkpoint-3500 +12/09/2025 15:07:57 - INFO - __main__ - Generating videos for validation... +12/09/2025 15:07:57 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:06<00:00, 7.62it/s] +12/09/2025 15:08:03 - INFO - __main__ - Validation videos saved to ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +12/09/2025 15:08:11 - INFO - __main__ - Step: 3510 Loss: 10.3953 LR: 0.000300 +12/09/2025 15:08:19 - INFO - __main__ - Step: 3520 Loss: 10.3537 LR: 0.000300 +12/09/2025 15:08:27 - INFO - __main__ - Step: 3530 Loss: 10.5191 LR: 0.000300 +12/09/2025 15:08:35 - INFO - __main__ - Step: 3540 Loss: 10.3530 LR: 0.000300 +12/09/2025 15:08:43 - INFO - __main__ - Step: 3550 Loss: 10.3432 LR: 0.000300 +12/09/2025 15:08:52 - INFO - __main__ - Step: 3560 Loss: 10.4816 LR: 0.000300 +12/09/2025 15:09:01 - INFO - __main__ - Step: 3570 Loss: 10.4883 LR: 0.000300 +12/09/2025 15:09:09 - INFO - __main__ - Step: 3580 Loss: 10.2422 LR: 0.000300 +12/09/2025 15:09:17 - INFO - __main__ - Step: 3590 Loss: 10.4056 LR: 0.000300 +12/09/2025 15:09:25 - INFO - __main__ - Step: 3600 Loss: 10.4111 LR: 0.000300 +12/09/2025 15:09:33 - INFO - __main__ - Step: 3610 Loss: 10.4007 LR: 0.000300 +12/09/2025 15:09:41 - INFO - __main__ - Step: 3620 Loss: 10.4743 LR: 0.000300 +12/09/2025 15:09:49 - INFO - __main__ - Step: 3630 Loss: 10.4240 LR: 0.000300 +12/09/2025 15:09:58 - INFO - __main__ - Step: 3640 Loss: 10.4337 LR: 0.000300 +12/09/2025 15:10:06 - INFO - __main__ - Step: 3650 Loss: 10.4613 LR: 0.000300 +12/09/2025 15:10:15 - INFO - __main__ - Step: 3660 Loss: 10.3092 LR: 0.000300 +12/09/2025 15:10:23 - INFO - __main__ - Step: 3670 Loss: 10.4424 LR: 0.000300 +12/09/2025 15:10:31 - INFO - __main__ - Step: 3680 Loss: 10.3409 LR: 0.000300 +12/09/2025 15:10:39 - INFO - __main__ - Step: 3690 Loss: 10.3445 LR: 0.000300 +12/09/2025 15:10:47 - INFO - __main__ - Step: 3700 Loss: 10.4723 LR: 0.000300 +12/09/2025 15:10:56 - INFO - __main__ - Step: 3710 Loss: 10.4960 LR: 0.000300 +12/09/2025 15:11:05 - INFO - __main__ - Step: 3720 Loss: 10.4134 LR: 0.000300 +12/09/2025 15:11:13 - INFO - __main__ - Step: 3730 Loss: 10.2951 LR: 0.000300 +12/09/2025 15:11:22 - INFO - __main__ - Step: 3740 Loss: 10.3510 LR: 0.000300 +12/09/2025 15:11:29 - INFO - __main__ - Step: 3750 Loss: 10.3872 LR: 0.000300 +12/09/2025 15:11:36 - INFO - __main__ - Step: 3760 Loss: 10.3660 LR: 0.000300 +12/09/2025 15:11:45 - INFO - __main__ - Step: 3770 Loss: 10.4696 LR: 0.000300 +12/09/2025 15:11:53 - INFO - __main__ - Step: 3780 Loss: 10.5097 LR: 0.000300 +12/09/2025 15:12:02 - INFO - __main__ - Step: 3790 Loss: 10.3678 LR: 0.000300 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1717, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1422, in main + encoder_hidden_states, cond_embeds = encode_prompt( + File "/mnt/Meissonic/train/dataset_utils.py", line 79, in encode_prompt + outputs = text_encoder(input_ids=input_ids, return_dict=True) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1932, in forward + encoder_outputs = self.encoder( + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1100, in forward + layer_outputs = layer_module( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/modeling_layers.py", line 94, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 687, in forward + self_attention_outputs = self.layer[0]( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 603, in forward + attention_output = self.SelfAttention( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 529, in forward + scores = torch.matmul(query_states, key_states.transpose(3, 2)) +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1717, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1422, in main +[rank0]: encoder_hidden_states, cond_embeds = encode_prompt( +[rank0]: File "/mnt/Meissonic/train/dataset_utils.py", line 79, in encode_prompt +[rank0]: outputs = text_encoder(input_ids=input_ids, return_dict=True) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1932, in forward +[rank0]: encoder_outputs = self.encoder( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1100, in forward +[rank0]: layer_outputs = layer_module( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/modeling_layers.py", line 94, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 687, in forward +[rank0]: self_attention_outputs = self.layer[0]( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 603, in forward +[rank0]: attention_output = self.SelfAttention( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 529, in forward +[rank0]: scores = torch.matmul(query_states, key_states.transpose(3, 2)) +[rank0]: KeyboardInterrupt diff --git 
a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/requirements.txt b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/wandb-metadata.json new file mode 100644 index 
0000000000000000000000000000000000000000..66f2e09c95c51d0cbe9ea65818c0d8d0982ff0d2 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T14:17:39.551844Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12093763186688" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA 
A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "ca77dv9hnvwqayawxeokto06mdonx22t" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug-core.log b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..44fd07d0bc8833f6606cc4492399ec75cac9cd76 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-09T14:17:39.622485456Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpv17shkgs/port-73281.txt","pid":73281,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T14:17:39.622942873Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":73281} +{"time":"2025-12-09T14:17:39.622956653Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-73281-73540-1131859293/socket","Net":"unix"}} +{"time":"2025-12-09T14:17:39.809839589Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T14:17:39.815956521Z","level":"INFO","msg":"handleInformInit: received","streamId":"fk5kdvzr","id":"1(@)"} +{"time":"2025-12-09T14:17:39.993579753Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"fk5kdvzr","id":"1(@)"} +{"time":"2025-12-09T15:12:09.52393793Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug-internal.log b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..772a69f2f3f3ff59b824f62b485a1eeeac9db259 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-09T14:17:39.816055654Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T14:17:39.993414647Z","level":"INFO","msg":"stream: created new stream","id":"fk5kdvzr"} +{"time":"2025-12-09T14:17:39.993490315Z","level":"INFO","msg":"handler: started","stream_id":"fk5kdvzr"} +{"time":"2025-12-09T14:17:39.993572987Z","level":"INFO","msg":"stream: started","id":"fk5kdvzr"} +{"time":"2025-12-09T14:17:39.99358997Z","level":"INFO","msg":"writer: started","stream_id":"fk5kdvzr"} +{"time":"2025-12-09T14:17:39.993593276Z","level":"INFO","msg":"sender: started","stream_id":"fk5kdvzr"} diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug.log b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..09232eecfb7e4555a3c1ffeef602046a3362ee1c --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-09 14:17:39,554 INFO MainThread:73281 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 14:17:39,554 INFO MainThread:73281 [wandb_setup.py:_flush():80] Configure stats pid to 73281 +2025-12-09 14:17:39,554 INFO MainThread:73281 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 14:17:39,554 INFO MainThread:73281 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 
14:17:39,554 INFO MainThread:73281 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 14:17:39,555 INFO MainThread:73281 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug.log +2025-12-09 14:17:39,555 INFO MainThread:73281 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_141739-fk5kdvzr/logs/debug-internal.log +2025-12-09 14:17:39,555 INFO MainThread:73281 [wandb_init.py:init():841] calling init triggers +2025-12-09 14:17:39,555 INFO MainThread:73281 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 14:17:39,555 INFO MainThread:73281 [wandb_init.py:init():889] starting backend +2025-12-09 14:17:39,810 INFO MainThread:73281 [wandb_init.py:init():892] sending inform_init request +2025-12-09 14:17:39,814 INFO MainThread:73281 [wandb_init.py:init():900] backend started and connected +2025-12-09 14:17:39,815 INFO MainThread:73281 [wandb_init.py:init():970] updated telemetry +2025-12-09 14:17:39,820 INFO MainThread:73281 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 14:17:40,361 INFO MainThread:73281 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 14:17:40,486 INFO MainThread:73281 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 14:17:40,486 INFO MainThread:73281 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 14:17:40,486 INFO MainThread:73281 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 14:17:40,487 INFO MainThread:73281 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 14:17:40,491 INFO MainThread:73281 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 14:17:40,492 INFO MainThread:73281 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 
'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251209_141739-fk5kdvzr/run-fk5kdvzr.wandb b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/run-fk5kdvzr.wandb new file mode 100644 index 0000000000000000000000000000000000000000..66bad48a331f13626a23d69e4c7418893cb91f22 --- /dev/null +++ b/Meissonic/wandb/run-20251209_141739-fk5kdvzr/run-fk5kdvzr.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5039d1255ddff8897a7b3a47e6ddc6c6b07ca052e55c347f09fd06fba0666dca +size 1146880 diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/files/config.yaml b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ab3f705d4de82efd2f8538aae53b0802b6b0abe --- /dev/null +++ b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/config.yaml @@ -0,0 +1,301 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + b1w9ewbbbbzrcttqa1g135sn919dk75f: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12133355233280" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + 
cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T16:05:23.017782Z" + writerId: b1w9ewbbbbzrcttqa1g135sn919dk75f + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 
+video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/files/output.log b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..0ec9ee539add3dfa42a06f96cd221a9eb71bccca --- /dev/null +++ b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/output.log @@ -0,0 +1,18 @@ +12/09/2025 16:05:23 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 16:05:23 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 16:05:23 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/09/2025 16:05:23 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 16:05:23 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/09/2025 16:05:23 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1687, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 620, in main + if sample_path: +NameError: name 'sample_path' is not defined +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1687, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 620, in main +[rank0]: if sample_path: +[rank0]: NameError: name 'sample_path' is not defined diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/files/requirements.txt b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 
+wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..23823421ac4bca2c8ae3fc2d6bc66dd310ae8744 --- /dev/null +++ b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T16:05:23.017782Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": 
"https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133355233280" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "b1w9ewbbbbzrcttqa1g135sn919dk75f" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/files/wandb-summary.json b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1d476fc88692f959c7a899096787abbc21a55dbc --- /dev/null +++ b/Meissonic/wandb/run-20251209_160523-sefi33g5/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":0,"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug-core.log b/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..3c820d56547238c6f6aa51055df6985677e58904 --- /dev/null +++ b/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T16:05:23.166093053Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpmor8_uof/port-798744.txt","pid":798744,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T16:05:23.166524827Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":798744} +{"time":"2025-12-09T16:05:23.166534234Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-798744-799009-3280574891/socket","Net":"unix"}} +{"time":"2025-12-09T16:05:23.349730714Z","level":"INFO","msg":"connection: ManageConnectionData: new connection 
created","id":"1(@)"} +{"time":"2025-12-09T16:05:23.357078081Z","level":"INFO","msg":"handleInformInit: received","streamId":"sefi33g5","id":"1(@)"} +{"time":"2025-12-09T16:05:23.540330681Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"sefi33g5","id":"1(@)"} +{"time":"2025-12-09T16:05:23.939027367Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T16:05:23.939066018Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T16:05:23.939064052Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T16:05:23.939122022Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T16:05:23.939132926Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-798744-799009-3280574891/socket","Net":"unix"}} +{"time":"2025-12-09T16:05:24.563153344Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T16:05:24.563181492Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T16:05:24.563190061Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug-internal.log b/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ed7dac309c7049421005f6129fc60bd287c4e833 --- /dev/null +++ b/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T16:05:23.35718071Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T16:05:23.539982439Z","level":"INFO","msg":"stream: created new stream","id":"sefi33g5"} +{"time":"2025-12-09T16:05:23.540208535Z","level":"INFO","msg":"handler: started","stream_id":"sefi33g5"} +{"time":"2025-12-09T16:05:23.540324015Z","level":"INFO","msg":"stream: started","id":"sefi33g5"} +{"time":"2025-12-09T16:05:23.540345419Z","level":"INFO","msg":"writer: started","stream_id":"sefi33g5"} +{"time":"2025-12-09T16:05:23.540345031Z","level":"INFO","msg":"sender: started","stream_id":"sefi33g5"} +{"time":"2025-12-09T16:05:23.939076805Z","level":"INFO","msg":"stream: closing","id":"sefi33g5"} +{"time":"2025-12-09T16:05:24.440688479Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T16:05:24.558111562Z","level":"INFO","msg":"handler: closed","stream_id":"sefi33g5"} +{"time":"2025-12-09T16:05:24.558188368Z","level":"INFO","msg":"sender: closed","stream_id":"sefi33g5"} +{"time":"2025-12-09T16:05:24.558193939Z","level":"INFO","msg":"stream: closed","id":"sefi33g5"} diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug.log b/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..526c3d73ac0171a010e8970451182a0e63181075 --- /dev/null +++ b/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_setup.py:_flush():80] Configure stats pid to 798744 +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_setup.py:_flush():80] Loading settings from 
/mnt/Meissonic/wandb/settings +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug.log +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_160523-sefi33g5/logs/debug-internal.log +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_init.py:init():841] calling init triggers +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 16:05:23,020 INFO MainThread:798744 [wandb_init.py:init():889] starting backend +2025-12-09 16:05:23,349 INFO MainThread:798744 [wandb_init.py:init():892] sending inform_init request +2025-12-09 16:05:23,355 INFO MainThread:798744 [wandb_init.py:init():900] backend started and connected +2025-12-09 16:05:23,356 INFO MainThread:798744 [wandb_init.py:init():970] updated telemetry +2025-12-09 16:05:23,361 INFO MainThread:798744 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 16:05:23,799 INFO MainThread:798744 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 16:05:23,924 INFO MainThread:798744 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 16:05:23,925 INFO MainThread:798744 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 16:05:23,925 INFO MainThread:798744 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 16:05:23,925 INFO MainThread:798744 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-09 16:05:23,929 INFO MainThread:798744 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 16:05:23,930 INFO MainThread:798744 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 16:05:23,939 INFO wandb-AsyncioManager-main:798744 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 16:05:23,939 INFO wandb-AsyncioManager-main:798744 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251209_160523-sefi33g5/run-sefi33g5.wandb b/Meissonic/wandb/run-20251209_160523-sefi33g5/run-sefi33g5.wandb new file mode 100644 index 0000000000000000000000000000000000000000..7fd549ac0baaa7a709f06e33567467d01a531670 Binary files /dev/null and b/Meissonic/wandb/run-20251209_160523-sefi33g5/run-sefi33g5.wandb differ diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/files/config.yaml b/Meissonic/wandb/run-20251209_160614-00otapa7/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0eac4fdea382e3a37c4543136575d402f9618b9e --- /dev/null +++ b/Meissonic/wandb/run-20251209_160614-00otapa7/files/config.yaml @@ -0,0 +1,301 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + udoaiv9x0g97untt15waac5tzywks3cq: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12133355352064" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + 
memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T16:06:14.295708Z" + writerId: udoaiv9x0g97untt15waac5tzywks3cq + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/files/output.log b/Meissonic/wandb/run-20251209_160614-00otapa7/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..69575f236ed6b937da5d3d7939b6aef16053b720 --- /dev/null +++ b/Meissonic/wandb/run-20251209_160614-00otapa7/files/output.log @@ -0,0 +1,54 @@ +12/09/2025 
16:06:15 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 16:06:15 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 16:06:15 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/09/2025 16:06:15 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 16:06:15 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/09/2025 16:06:15 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +12/09/2025 16:06:15 - INFO - __main__ - Got text_dim from metadata: 4096 +12/09/2025 16:06:15 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:06:15 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 16:06:31 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:06:31 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:06:34 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 16:06:36 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 16:06:36 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 16:06:36 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 16:06:36 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 16:06:36 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 16:06:36 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 16:06:36 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 16:06:36 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 16:06:36 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 16:06:36 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/09/2025 16:06:36 - INFO - __main__ - Dataloader configuration: +12/09/2025 16:06:36 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 16:06:36 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 16:06:36 - INFO - __main__ - - persistent_workers: True +12/09/2025 16:06:36 - INFO - __main__ - - pin_memory: True +12/09/2025 16:06:36 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 16:06:37 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/09/2025 16:06:37 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/09/2025 16:06:37 - INFO - __main__ - ***** Running training ***** +12/09/2025 16:06:37 - INFO - __main__ - Num training steps = 10000 +12/09/2025 16:06:37 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 16:06:37 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/09/2025 16:06:37 - INFO - __main__ - Gradient Accumulation steps = 1 +[DEBUG] video_tokens: shape=torch.Size([1, 5, 60, 106]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/09/2025 16:06:38 - WARNING - __main__ - cond_dropout_prob > 0.0 is not supported with precomputed features. Skipping cond_dropout. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1687, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1493, in main + loss = F.cross_entropy( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy + return torch._C._nn.cross_entropy_loss( +NotImplementedError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int' +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1687, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1493, in main +[rank0]: loss = F.cross_entropy( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy +[rank0]: return torch._C._nn.cross_entropy_loss( +[rank0]: NotImplementedError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int' diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/files/requirements.txt b/Meissonic/wandb/run-20251209_160614-00otapa7/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_160614-00otapa7/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 
+Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_160614-00otapa7/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e2fd3275739a2894d9a3710f50005819f7cb05a5 --- /dev/null +++ b/Meissonic/wandb/run-20251209_160614-00otapa7/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T16:06:14.295708Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": 
"/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133355352064" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "udoaiv9x0g97untt15waac5tzywks3cq" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/files/wandb-summary.json b/Meissonic/wandb/run-20251209_160614-00otapa7/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6735d66125b699c0dbd8ec56cf36ff05c01fa59e --- /dev/null +++ b/Meissonic/wandb/run-20251209_160614-00otapa7/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":68,"_wandb":{"runtime":68}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug-core.log b/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..a159cee46270a7fb58bfbb4cc369ab914f8edbbb --- /dev/null +++ b/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T16:06:14.367769937Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpqm293gl8/port-799765.txt","pid":799765,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T16:06:14.368350355Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":799765} +{"time":"2025-12-09T16:06:14.368359779Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-799765-800014-3124456346/socket","Net":"unix"}} +{"time":"2025-12-09T16:06:14.554246528Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T16:06:14.560223528Z","level":"INFO","msg":"handleInformInit: received","streamId":"00otapa7","id":"1(@)"} 
+{"time":"2025-12-09T16:06:14.727400643Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"00otapa7","id":"1(@)"} +{"time":"2025-12-09T16:07:23.851592211Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T16:07:23.851675183Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T16:07:23.851702883Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T16:07:23.851817322Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T16:07:23.852027714Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-799765-800014-3124456346/socket","Net":"unix"}} +{"time":"2025-12-09T16:07:24.186884958Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T16:07:24.186921655Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T16:07:24.186934204Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug-internal.log b/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..f8767d43597ab3f3ef82f1e1c5e47c3492367fb2 --- /dev/null +++ b/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T16:06:14.56032165Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T16:06:14.727184038Z","level":"INFO","msg":"stream: created new stream","id":"00otapa7"} +{"time":"2025-12-09T16:06:14.727271225Z","level":"INFO","msg":"handler: started","stream_id":"00otapa7"} +{"time":"2025-12-09T16:06:14.727394239Z","level":"INFO","msg":"stream: started","id":"00otapa7"} +{"time":"2025-12-09T16:06:14.727432914Z","level":"INFO","msg":"sender: started","stream_id":"00otapa7"} +{"time":"2025-12-09T16:06:14.727432154Z","level":"INFO","msg":"writer: started","stream_id":"00otapa7"} +{"time":"2025-12-09T16:07:23.851661162Z","level":"INFO","msg":"stream: closing","id":"00otapa7"} +{"time":"2025-12-09T16:07:24.081209759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T16:07:24.183666884Z","level":"INFO","msg":"handler: closed","stream_id":"00otapa7"} +{"time":"2025-12-09T16:07:24.183798562Z","level":"INFO","msg":"sender: closed","stream_id":"00otapa7"} +{"time":"2025-12-09T16:07:24.183810463Z","level":"INFO","msg":"stream: closed","id":"00otapa7"} diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug.log b/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..da47719c3e20217832fe8b45fe77e258a9adc5da --- /dev/null +++ b/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 16:06:14,298 INFO MainThread:799765 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 16:06:14,298 INFO MainThread:799765 [wandb_setup.py:_flush():80] Configure stats pid to 799765 +2025-12-09 16:06:14,298 INFO MainThread:799765 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 16:06:14,298 INFO MainThread:799765 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 16:06:14,298 INFO MainThread:799765 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 
16:06:14,298 INFO MainThread:799765 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug.log +2025-12-09 16:06:14,299 INFO MainThread:799765 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_160614-00otapa7/logs/debug-internal.log +2025-12-09 16:06:14,299 INFO MainThread:799765 [wandb_init.py:init():841] calling init triggers +2025-12-09 16:06:14,299 INFO MainThread:799765 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 16:06:14,299 INFO MainThread:799765 [wandb_init.py:init():889] starting backend +2025-12-09 16:06:14,554 INFO MainThread:799765 [wandb_init.py:init():892] sending inform_init request +2025-12-09 16:06:14,558 INFO MainThread:799765 [wandb_init.py:init():900] backend started and connected +2025-12-09 16:06:14,560 INFO MainThread:799765 [wandb_init.py:init():970] updated telemetry +2025-12-09 16:06:14,564 INFO MainThread:799765 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 16:06:14,955 INFO MainThread:799765 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 16:06:15,080 INFO MainThread:799765 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 16:06:15,081 INFO MainThread:799765 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 16:06:15,081 INFO MainThread:799765 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 16:06:15,081 INFO MainThread:799765 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 16:06:15,084 INFO MainThread:799765 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 16:06:15,085 INFO MainThread:799765 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 
'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 16:07:23,851 INFO wandb-AsyncioManager-main:799765 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 16:07:23,852 INFO wandb-AsyncioManager-main:799765 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251209_160614-00otapa7/run-00otapa7.wandb b/Meissonic/wandb/run-20251209_160614-00otapa7/run-00otapa7.wandb new file mode 100644 index 0000000000000000000000000000000000000000..d9c82880dd9ad2a6f4fc9eaa33db11173e79510d Binary files /dev/null and b/Meissonic/wandb/run-20251209_160614-00otapa7/run-00otapa7.wandb differ diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/config.yaml b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ddf814eef927384674503e03a98ef97b062bae7e --- /dev/null +++ b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/config.yaml @@ -0,0 +1,301 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + jp7apqd2y7dvf73te9x78rw0nrf3puvr: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12133355528192" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: 
GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T16:10:19.255348Z" + writerId: jp7apqd2y7dvf73te9x78rw0nrf3puvr + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 
+video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/output.log b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1c025b9f2312f7d45591ff20a2cd310ccab23c7c --- /dev/null +++ b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/output.log @@ -0,0 +1,155 @@ +12/09/2025 16:10:20 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 16:10:20 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 16:10:20 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/09/2025 16:10:20 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 16:10:20 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/09/2025 16:10:20 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +12/09/2025 16:10:20 - INFO - __main__ - Got text_dim from metadata: 4096 +12/09/2025 16:10:20 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:10:20 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 16:10:36 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:10:36 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:10:38 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 16:10:40 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 16:10:40 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 16:10:40 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 16:10:40 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 16:10:40 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 16:10:40 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 16:10:40 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 16:10:40 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 16:10:40 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 16:10:40 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/09/2025 16:10:40 - INFO - __main__ - Dataloader configuration: +12/09/2025 16:10:40 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 16:10:40 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 16:10:40 - INFO - __main__ - - persistent_workers: True +12/09/2025 16:10:40 - INFO - __main__ - - pin_memory: True +12/09/2025 16:10:40 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 16:10:42 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/09/2025 16:10:42 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/09/2025 16:10:42 - INFO - __main__ - ***** Running training ***** 
+12/09/2025 16:10:42 - INFO - __main__ - Num training steps = 10000 +12/09/2025 16:10:42 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 16:10:42 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 8 +12/09/2025 16:10:42 - INFO - __main__ - Gradient Accumulation steps = 1 +[DEBUG] video_tokens: shape=torch.Size([1, 5, 60, 106]), dtype=torch.int64, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/09/2025 16:10:43 - WARNING - __main__ - cond_dropout_prob > 0.0 is not supported with precomputed features. Skipping cond_dropout. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1689, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1506, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook + _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn + fn(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 489, in forward + x = cross_attn_ffn(x, context, context_lens, e) + File "/mnt/Meissonic/src/transformer_video.py", line 
484, in cross_attn_ffn + y = self.ffn(ffn_input) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward + input = module(input) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 816, in forward + return F.gelu(input, approximate=self.approximate) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 75.56 MiB is free. Including non-PyTorch memory, this process has 39.41 GiB memory in use. Of the allocated memory 38.15 GiB is allocated by PyTorch, and 155.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1689, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1506, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook +[rank0]: _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", 
line 1555, in recompute_fn +[rank0]: fn(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 489, in forward +[rank0]: x = cross_attn_ffn(x, context, context_lens, e) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 484, in cross_attn_ffn +[rank0]: y = self.ffn(ffn_input) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward +[rank0]: input = module(input) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 816, in forward +[rank0]: return F.gelu(input, approximate=self.approximate) +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 75.56 MiB is free. Including non-PyTorch memory, this process has 39.41 GiB memory in use. Of the allocated memory 38.15 GiB is allocated by PyTorch, and 155.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Exception ignored in atexit callback: +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1648, in _clean_up_worker + w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/process.py", line 149, in join + res = self._popen.wait(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait + if not wait([self.sentinel], timeout): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt: diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/requirements.txt b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 
+tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f8d73b037961f425dbbc35e44b5dd3a9ef4facd3 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T16:10:19.255348Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133355528192" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": 
"NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "jp7apqd2y7dvf73te9x78rw0nrf3puvr" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/wandb-summary.json b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c2c05989968aab8f861b59f70eec7de82b16820e --- /dev/null +++ b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":52},"_runtime":52} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug-core.log b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..a0a9163d675dd0d4341699e0397ce22914c33549 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T16:10:19.325772351Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmplk9fg4nn/port-804202.txt","pid":804202,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T16:10:19.326353613Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":804202} +{"time":"2025-12-09T16:10:19.326328884Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-804202-804449-3058100521/socket","Net":"unix"}} +{"time":"2025-12-09T16:10:19.512203624Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T16:10:19.518374741Z","level":"INFO","msg":"handleInformInit: received","streamId":"ca5tv9y1","id":"1(@)"} +{"time":"2025-12-09T16:10:19.684904074Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ca5tv9y1","id":"1(@)"} +{"time":"2025-12-09T16:11:12.802727993Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T16:11:12.802814833Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T16:11:12.802800761Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T16:11:12.802931646Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} 
+{"time":"2025-12-09T16:11:12.802930719Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-804202-804449-3058100521/socket","Net":"unix"}} +{"time":"2025-12-09T16:11:13.183150726Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T16:11:13.183170626Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T16:11:13.18317985Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug-internal.log b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..cfa4cb962d701e9ec9fff00e28eeb19bc3126a76 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T16:10:19.518476001Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T16:10:19.684672449Z","level":"INFO","msg":"stream: created new stream","id":"ca5tv9y1"} +{"time":"2025-12-09T16:10:19.684757371Z","level":"INFO","msg":"handler: started","stream_id":"ca5tv9y1"} +{"time":"2025-12-09T16:10:19.684895948Z","level":"INFO","msg":"stream: started","id":"ca5tv9y1"} +{"time":"2025-12-09T16:10:19.684911695Z","level":"INFO","msg":"writer: started","stream_id":"ca5tv9y1"} +{"time":"2025-12-09T16:10:19.684914721Z","level":"INFO","msg":"sender: started","stream_id":"ca5tv9y1"} +{"time":"2025-12-09T16:11:12.802806611Z","level":"INFO","msg":"stream: closing","id":"ca5tv9y1"} +{"time":"2025-12-09T16:11:13.06038557Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T16:11:13.180327674Z","level":"INFO","msg":"handler: closed","stream_id":"ca5tv9y1"} +{"time":"2025-12-09T16:11:13.180418814Z","level":"INFO","msg":"sender: closed","stream_id":"ca5tv9y1"} +{"time":"2025-12-09T16:11:13.180425983Z","level":"INFO","msg":"stream: closed","id":"ca5tv9y1"} diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug.log b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..014320019c61c76e7fbd692d9b199538482b5751 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_setup.py:_flush():80] Configure stats pid to 804202 +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug.log +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_161019-ca5tv9y1/logs/debug-internal.log +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_init.py:init():841] calling init triggers +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_init.py:init():846] wandb.init called with 
sweep_config: {} +config: {'_wandb': {}} +2025-12-09 16:10:19,258 INFO MainThread:804202 [wandb_init.py:init():889] starting backend +2025-12-09 16:10:19,512 INFO MainThread:804202 [wandb_init.py:init():892] sending inform_init request +2025-12-09 16:10:19,516 INFO MainThread:804202 [wandb_init.py:init():900] backend started and connected +2025-12-09 16:10:19,518 INFO MainThread:804202 [wandb_init.py:init():970] updated telemetry +2025-12-09 16:10:19,522 INFO MainThread:804202 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 16:10:20,086 INFO MainThread:804202 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 16:10:20,211 INFO MainThread:804202 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 16:10:20,211 INFO MainThread:804202 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 16:10:20,211 INFO MainThread:804202 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 16:10:20,211 INFO MainThread:804202 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 16:10:20,214 INFO MainThread:804202 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 16:10:20,215 INFO MainThread:804202 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 16:11:12,802 INFO wandb-AsyncioManager-main:804202 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 16:11:12,803 INFO wandb-AsyncioManager-main:804202 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
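Aside for readers of the logs above (not part of the committed diff): both crashed runs end in torch.OutOfMemoryError, and the error text itself suggests trying PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce allocator fragmentation. A minimal sketch of one way to apply that hint, assuming the variable is set before the process first touches CUDA (either in the launch shell or at the very top of train/train_mei_video.py, before torch is imported):

    import os
    # Must be in the environment before the CUDA caching allocator initializes,
    # otherwise the setting is ignored for this process.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch  # imported only after the environment variable is in place

Exporting the same variable in the shell before launching the training command has the same effect; neither variant changes any of the files recorded in this diff.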
diff --git a/Meissonic/wandb/run-20251209_161019-ca5tv9y1/run-ca5tv9y1.wandb b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/run-ca5tv9y1.wandb new file mode 100644 index 0000000000000000000000000000000000000000..883786c305882e782e813676c57a238a64ce3ef8 Binary files /dev/null and b/Meissonic/wandb/run-20251209_161019-ca5tv9y1/run-ca5tv9y1.wandb differ diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/files/config.yaml b/Meissonic/wandb/run-20251209_161324-onokk16i/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..243c66905f46c26266b0a583226f49f43a54d59e --- /dev/null +++ b/Meissonic/wandb/run-20251209_161324-onokk16i/files/config.yaml @@ -0,0 +1,301 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + efbkiwa6x1upmhno4asmii8gb5l9pte0: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "1" + - --num_frames + - "4" + - --video_height + - "256" + - --video_width + - "448" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "1" + - --gradient_accumulation_steps + - "1" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12133355716608" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + 
memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T16:13:24.569020Z" + writerId: efbkiwa6x1upmhno4asmii8gb5l9pte0 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 4 +output_dir: + value: ./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 1 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 448 +wan_backbone_lr_ratio: + value: 1 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/files/output.log b/Meissonic/wandb/run-20251209_161324-onokk16i/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..008ac1dbee8a494dfbb8183d6c6c5c3bbc2a28d0 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161324-onokk16i/files/output.log @@ -0,0 +1,142 @@ +12/09/2025 
16:13:25 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/09/2025 16:13:25 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/09/2025 16:13:25 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/09/2025 16:13:25 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/09/2025 16:13:25 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/09/2025 16:13:25 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=60, W'=106 +12/09/2025 16:13:25 - INFO - __main__ - Got text_dim from metadata: 4096 +12/09/2025 16:13:25 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:13:26 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 16:13:42 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:13:42 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:13:44 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 16:13:46 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 16:13:46 - INFO - __main__ - Wan backbone lr = 0.000300 (base_lr * 1.0) +12/09/2025 16:13:46 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 16:13:46 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 16:13:46 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/09/2025 16:13:46 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/09/2025 16:13:46 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/09/2025 16:13:46 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/09/2025 16:13:46 - INFO - train.dataset_utils - Index range: 0 to 127 +12/09/2025 16:13:46 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/09/2025 16:13:46 - INFO - __main__ - Dataloader configuration: +12/09/2025 16:13:46 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 16:13:46 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 16:13:46 - INFO - __main__ - - persistent_workers: True +12/09/2025 16:13:46 - INFO - __main__ - - pin_memory: True +12/09/2025 16:13:46 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 16:13:48 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/09/2025 16:13:48 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/09/2025 16:13:48 - INFO - __main__ - ***** Running training ***** +12/09/2025 16:13:48 - INFO - __main__ - Num training steps = 10000 +12/09/2025 16:13:48 - INFO - __main__ - Instantaneous batch size per device = 1 +12/09/2025 16:13:48 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 8 +12/09/2025 16:13:48 - INFO - __main__ - Gradient Accumulation steps = 1 +[DEBUG] video_tokens: shape=torch.Size([1, 5, 60, 106]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/09/2025 16:13:49 - WARNING - __main__ - cond_dropout_prob > 0.0 is not supported with precomputed features. Skipping cond_dropout. +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1509, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook + _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn + fn(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 489, in forward + x = cross_attn_ffn(x, context, context_lens, e) + File "/mnt/Meissonic/src/transformer_video.py", line 484, in cross_attn_ffn + y = self.ffn(ffn_input) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return 
self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward + input = module(input) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 816, in forward + return F.gelu(input, approximate=self.approximate) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 75.56 MiB is free. Including non-PyTorch memory, this process has 39.41 GiB memory in use. Of the allocated memory 38.15 GiB is allocated by PyTorch, and 156.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1509, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1154, in unpack_hook +[rank0]: _run_fn_with_dynamo_disabled(frame.recompute_fn, *args) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1124, in _run_fn_with_dynamo_disabled +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1555, in recompute_fn +[rank0]: fn(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward +[rank0]: return module(x=x_in, t=t_in, 
context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 489, in forward +[rank0]: x = cross_attn_ffn(x, context, context_lens, e) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 484, in cross_attn_ffn +[rank0]: y = self.ffn(ffn_input) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward +[rank0]: input = module(input) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 816, in forward +[rank0]: return F.gelu(input, approximate=self.approximate) +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 75.56 MiB is free. Including non-PyTorch memory, this process has 39.41 GiB memory in use. Of the allocated memory 38.15 GiB is allocated by PyTorch, and 156.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/files/requirements.txt b/Meissonic/wandb/run-20251209_161324-onokk16i/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_161324-onokk16i/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/files/wandb-metadata.json 
b/Meissonic/wandb/run-20251209_161324-onokk16i/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..dd6f0336b28041ab148e75118b6586d05615d6b1 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161324-onokk16i/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T16:13:24.569020Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "1", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "1", + "--gradient_accumulation_steps", + "1", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133355716608" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + 
}, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "efbkiwa6x1upmhno4asmii8gb5l9pte0" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/files/wandb-summary.json b/Meissonic/wandb/run-20251209_161324-onokk16i/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6735d66125b699c0dbd8ec56cf36ff05c01fa59e --- /dev/null +++ b/Meissonic/wandb/run-20251209_161324-onokk16i/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":68,"_wandb":{"runtime":68}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug-core.log b/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..325943e78d5efa6fb978d29aa2beb95833274567 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T16:13:24.638327764Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpqmklcgtg/port-807619.txt","pid":807619,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T16:13:24.639641115Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":807619} +{"time":"2025-12-09T16:13:24.639636363Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-807619-807867-2491286150/socket","Net":"unix"}} +{"time":"2025-12-09T16:13:24.824846418Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T16:13:24.831086373Z","level":"INFO","msg":"handleInformInit: received","streamId":"onokk16i","id":"1(@)"} +{"time":"2025-12-09T16:13:25.002141481Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"onokk16i","id":"1(@)"} +{"time":"2025-12-09T16:14:33.950717019Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T16:14:33.95095849Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T16:14:33.951019432Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T16:14:33.951026191Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T16:14:33.95109025Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-807619-807867-2491286150/socket","Net":"unix"}} +{"time":"2025-12-09T16:14:34.306036996Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T16:14:34.306059112Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T16:14:34.306068898Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug-internal.log b/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b50ed98ebd95a50e243e925ce9c0e22e10921834 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug-internal.log @@ -0,0 +1,11 
@@ +{"time":"2025-12-09T16:13:24.831225512Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T16:13:25.001968424Z","level":"INFO","msg":"stream: created new stream","id":"onokk16i"} +{"time":"2025-12-09T16:13:25.002047757Z","level":"INFO","msg":"handler: started","stream_id":"onokk16i"} +{"time":"2025-12-09T16:13:25.002133131Z","level":"INFO","msg":"stream: started","id":"onokk16i"} +{"time":"2025-12-09T16:13:25.002148909Z","level":"INFO","msg":"writer: started","stream_id":"onokk16i"} +{"time":"2025-12-09T16:13:25.002150129Z","level":"INFO","msg":"sender: started","stream_id":"onokk16i"} +{"time":"2025-12-09T16:14:33.950793808Z","level":"INFO","msg":"stream: closing","id":"onokk16i"} +{"time":"2025-12-09T16:14:34.204038017Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T16:14:34.303757866Z","level":"INFO","msg":"handler: closed","stream_id":"onokk16i"} +{"time":"2025-12-09T16:14:34.303882509Z","level":"INFO","msg":"sender: closed","stream_id":"onokk16i"} +{"time":"2025-12-09T16:14:34.303890895Z","level":"INFO","msg":"stream: closed","id":"onokk16i"} diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug.log b/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..c36c76d2a43212ba68b19039f15926c3ca751a10 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 16:13:24,571 INFO MainThread:807619 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 16:13:24,571 INFO MainThread:807619 [wandb_setup.py:_flush():80] Configure stats pid to 807619 +2025-12-09 16:13:24,571 INFO MainThread:807619 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 16:13:24,571 INFO MainThread:807619 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 16:13:24,571 INFO MainThread:807619 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 16:13:24,571 INFO MainThread:807619 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug.log +2025-12-09 16:13:24,571 INFO MainThread:807619 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_161324-onokk16i/logs/debug-internal.log +2025-12-09 16:13:24,571 INFO MainThread:807619 [wandb_init.py:init():841] calling init triggers +2025-12-09 16:13:24,572 INFO MainThread:807619 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 16:13:24,572 INFO MainThread:807619 [wandb_init.py:init():889] starting backend +2025-12-09 16:13:24,825 INFO MainThread:807619 [wandb_init.py:init():892] sending inform_init request +2025-12-09 16:13:24,829 INFO MainThread:807619 [wandb_init.py:init():900] backend started and connected +2025-12-09 16:13:24,830 INFO MainThread:807619 [wandb_init.py:init():970] updated telemetry +2025-12-09 16:13:24,835 INFO MainThread:807619 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 16:13:25,804 INFO MainThread:807619 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 16:13:25,926 INFO MainThread:807619 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 16:13:25,926 INFO MainThread:807619 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 16:13:25,926 
INFO MainThread:807619 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 16:13:25,926 INFO MainThread:807619 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 16:13:25,929 INFO MainThread:807619 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 16:13:25,930 INFO MainThread:807619 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 1.0, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-09 16:14:33,949 INFO wandb-AsyncioManager-main:807619 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 16:14:33,950 INFO wandb-AsyncioManager-main:807619 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251209_161324-onokk16i/run-onokk16i.wandb b/Meissonic/wandb/run-20251209_161324-onokk16i/run-onokk16i.wandb new file mode 100644 index 0000000000000000000000000000000000000000..c0fc332fc4393eb42b75cd6642bf86b650bf0a4f Binary files /dev/null and b/Meissonic/wandb/run-20251209_161324-onokk16i/run-onokk16i.wandb differ diff --git a/Meissonic/wandb/run-20251209_161644-bp798sa7/files/output.log b/Meissonic/wandb/run-20251209_161644-bp798sa7/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..743119193f60c61acd582f9cd7c04f59b3db82e1 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161644-bp798sa7/files/output.log @@ -0,0 +1,62 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 66.82it/s] +You are using the default legacy behaviour of the . 
This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 16:16:48 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4936.13it/s] +12/09/2025 16:16:50 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=2, H'=32, W'=56 +12/09/2025 16:16:50 - INFO - __main__ - Theoretical dimensions: F'=1, H'=32, W'=56 +12/09/2025 16:16:50 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:16:50 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 16:17:06 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:17:06 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:17:07 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 16:17:10 - INFO - __main__ - Wan backbone parameters are frozen (requires_grad=False) +12/09/2025 16:17:10 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 16:17:10 - INFO - __main__ - Wan backbone is frozen (lr=0) +12/09/2025 16:17:10 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 16:17:10 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 16:17:17 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 16:17:17 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 16:17:17 - INFO - __main__ - Dataloader configuration: +12/09/2025 16:17:17 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 16:17:17 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 16:17:17 - INFO - __main__ - - persistent_workers: True +12/09/2025 16:17:17 - INFO - __main__ - - pin_memory: True +12/09/2025 16:17:17 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 16:17:34 - INFO - __main__ - ***** Running training ***** +12/09/2025 16:17:34 - INFO - __main__ - Num training steps = 10000 +12/09/2025 16:17:34 - INFO - __main__ - Instantaneous batch size per device = 2 +12/09/2025 16:17:34 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/09/2025 16:17:34 - INFO - __main__ - Gradient Accumulation steps = 4 +12/09/2025 16:18:08 - INFO - __main__ - Step: 10 Loss: 11.1675 LR: 0.000000 +12/09/2025 16:18:33 - INFO - __main__ - Step: 20 Loss: 11.2165 LR: 0.000000 +12/09/2025 16:18:59 - INFO - __main__ - Step: 30 Loss: 11.2436 LR: 0.000000 +12/09/2025 16:19:25 - INFO - __main__ - Step: 40 Loss: 11.1650 LR: 0.000000 +12/09/2025 16:19:50 - INFO - __main__ - Step: 50 Loss: 11.2041 LR: 0.000000 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1509, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1509, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251209_161644-bp798sa7/files/requirements.txt b/Meissonic/wandb/run-20251209_161644-bp798sa7/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_161644-bp798sa7/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 
+psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_161644-bp798sa7/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_161644-bp798sa7/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..df3be0f2fac448749760ea44e3c51b2892ed13dd --- /dev/null +++ b/Meissonic/wandb/run-20251209_161644-bp798sa7/files/wandb-metadata.json @@ -0,0 +1,154 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T16:16:44.765090Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--freeze_wan_backbone", + "--wan_backbone_lr_ratio", + "0.0", + "--num_frames", + "4", + "--video_height", + "256", + "--video_width", + "448", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + 
"500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133355880448" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "3192wi3a06uq5u8scltoogaxc8idi9af" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug-core.log b/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..47d904de5f67ea31c9659907f58f11e6e155e3a7 --- /dev/null +++ b/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-09T16:16:44.836517546Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpr2ku7q4m/port-811188.txt","pid":811188,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T16:16:44.837153974Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":811188} 
+{"time":"2025-12-09T16:16:44.837131302Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-811188-811451-2759201103/socket","Net":"unix"}} +{"time":"2025-12-09T16:16:45.022783266Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T16:16:45.028720923Z","level":"INFO","msg":"handleInformInit: received","streamId":"bp798sa7","id":"1(@)"} +{"time":"2025-12-09T16:16:45.198649239Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"bp798sa7","id":"1(@)"} +{"time":"2025-12-09T16:20:14.001036771Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug-internal.log b/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2ba5c55b7ea6b2d4ce753404d9887269b5e44f0b --- /dev/null +++ b/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-09T16:16:45.028818035Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T16:16:45.198406717Z","level":"INFO","msg":"stream: created new stream","id":"bp798sa7"} +{"time":"2025-12-09T16:16:45.198506901Z","level":"INFO","msg":"handler: started","stream_id":"bp798sa7"} +{"time":"2025-12-09T16:16:45.198641791Z","level":"INFO","msg":"stream: started","id":"bp798sa7"} +{"time":"2025-12-09T16:16:45.1986524Z","level":"INFO","msg":"writer: started","stream_id":"bp798sa7"} +{"time":"2025-12-09T16:16:45.198662453Z","level":"INFO","msg":"sender: started","stream_id":"bp798sa7"} diff --git a/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug.log b/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7ae08460627b46617a54bdfdb270c3685edcd78c --- /dev/null +++ b/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_setup.py:_flush():80] Configure stats pid to 811188 +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug.log +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_161644-bp798sa7/logs/debug-internal.log +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_init.py:init():841] calling init triggers +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 16:16:44,768 INFO MainThread:811188 [wandb_init.py:init():889] starting backend +2025-12-09 16:16:45,023 INFO MainThread:811188 [wandb_init.py:init():892] sending inform_init request +2025-12-09 16:16:45,027 INFO MainThread:811188 [wandb_init.py:init():900] backend started and 
connected +2025-12-09 16:16:45,028 INFO MainThread:811188 [wandb_init.py:init():970] updated telemetry +2025-12-09 16:16:45,032 INFO MainThread:811188 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 16:16:45,622 INFO MainThread:811188 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 16:16:45,751 INFO MainThread:811188 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 16:16:45,751 INFO MainThread:811188 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 16:16:45,752 INFO MainThread:811188 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 16:16:45,752 INFO MainThread:811188 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 16:16:45,755 INFO MainThread:811188 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 16:16:45,756 INFO MainThread:811188 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 4, 'video_height': 256, 'video_width': 448, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': True, 'wan_backbone_lr_ratio': 0.0, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251209_161644-bp798sa7/run-bp798sa7.wandb b/Meissonic/wandb/run-20251209_161644-bp798sa7/run-bp798sa7.wandb new file mode 100644 index 0000000000000000000000000000000000000000..5412808f967bb22f92b499679d77de8484ad33d1 Binary files /dev/null and b/Meissonic/wandb/run-20251209_161644-bp798sa7/run-bp798sa7.wandb differ diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/files/config.yaml b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..901989dc1946d790939451b73f93185bb6722b8c --- /dev/null +++ 
b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/config.yaml @@ -0,0 +1,298 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + p7qmb4hazkd40rqjeu38ugero566r4ft: + args: + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "16" + - --video_height + - "180" + - --video_width + - "320" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "2" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12133356040192" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T16:21:04.467894Z" + writerId: p7qmb4hazkd40rqjeu38ugero566r4ft + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 
0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: null +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 2 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 180 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 320 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/files/output.log b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..ecef062613a57610b4deda9404546704300884e2 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/output.log @@ -0,0 +1,125 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 70.72it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 16:21:07 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4878.72it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 632, in main + dummy_tokens = video_tokenizer.encode(dummy_video) # [1, F', H', W'] + File "/mnt/Meissonic/src/pipeline_video.py", line 181, in encode + result = self.encoder(video) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) +RuntimeError: The following operation failed in the TorchScript interpreter. +Traceback of TorchScript, serialized code (most recent call last): + File "code/__torch__/torch/nn/modules/container/___torch_mangle_13750.py", line 14, in forward + quant_conv = self.quant_conv + encoder = self.encoder + _0 = (quant_conv).forward((encoder).forward(input, ), ) + ~~~~~~~~~~~~~~~~ <--- HERE + _1, _2, _3, = (quantizer).forward(_0, ) + return (_1, _2, _3) + File "code/__torch__/projects/edify_tokenizer/v1/module/layers3d/___torch_mangle_13744.py", line 56, in forward + _5 = (conv_in).forward((patcher3d).forward(input, ), ) + _6 = (_14).forward((_04).forward(_5, ), ) + _7 = (_00).forward((downsample0).forward(_6, ), ) + ~~~~~~~~~~~~~~~~~~~~ <--- HERE + _3 = (_12).forward(_7, ) + _4 = (downsample).forward() + File "code/__torch__/projects/edify_tokenizer/v1/module/layers3d/___torch_mangle_13597.py", line 16, in forward + _0 = (conv1).forward(x, ) + x2 = torch.avg_pool3d(x, [1, 2, 2], [1, 2, 2], [0, 0, 0]) + x0 = torch.add(_0, x2) + ~~~~~~~~~ <--- HERE + return (conv3).forward(x0, ) + +Traceback of TorchScript, original code (most recent call last): +/lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/module/layers3d.py(193): forward +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl +/lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/module/layers3d.py(680): forward +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py(218): forward +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl +/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl 
+/usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(1274): trace_module +/usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(694): _trace_impl +/usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(999): trace +/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py(574): _fn +/lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/checkpointer.py(270): _get_ema_jit +/lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/checkpointer.py(94): save +/lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/trainer.py(139): train +/lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/scripts/train.py(44): launch +/usr/local/lib/python3.10/dist-packages/loguru/_logger.py(1277): catch_wrapper +/lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/scripts/train.py(80): +/usr/lib/python3.10/runpy.py(86): _run_code +/usr/lib/python3.10/runpy.py(196): _run_module_as_main +RuntimeError: The size of tensor a (22) must match the size of tensor b (23) at non-singleton dimension 3 + +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 632, in main +[rank0]: dummy_tokens = video_tokenizer.encode(dummy_video) # [1, F', H', W'] +[rank0]: File "/mnt/Meissonic/src/pipeline_video.py", line 181, in encode +[rank0]: result = self.encoder(video) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: RuntimeError: The following operation failed in the TorchScript interpreter. 
+[rank0]: Traceback of TorchScript, serialized code (most recent call last): +[rank0]: File "code/__torch__/torch/nn/modules/container/___torch_mangle_13750.py", line 14, in forward +[rank0]: quant_conv = self.quant_conv +[rank0]: encoder = self.encoder +[rank0]: _0 = (quant_conv).forward((encoder).forward(input, ), ) +[rank0]: ~~~~~~~~~~~~~~~~ <--- HERE +[rank0]: _1, _2, _3, = (quantizer).forward(_0, ) +[rank0]: return (_1, _2, _3) +[rank0]: File "code/__torch__/projects/edify_tokenizer/v1/module/layers3d/___torch_mangle_13744.py", line 56, in forward +[rank0]: _5 = (conv_in).forward((patcher3d).forward(input, ), ) +[rank0]: _6 = (_14).forward((_04).forward(_5, ), ) +[rank0]: _7 = (_00).forward((downsample0).forward(_6, ), ) +[rank0]: ~~~~~~~~~~~~~~~~~~~~ <--- HERE +[rank0]: _3 = (_12).forward(_7, ) +[rank0]: _4 = (downsample).forward() +[rank0]: File "code/__torch__/projects/edify_tokenizer/v1/module/layers3d/___torch_mangle_13597.py", line 16, in forward +[rank0]: _0 = (conv1).forward(x, ) +[rank0]: x2 = torch.avg_pool3d(x, [1, 2, 2], [1, 2, 2], [0, 0, 0]) +[rank0]: x0 = torch.add(_0, x2) +[rank0]: ~~~~~~~~~ <--- HERE +[rank0]: return (conv3).forward(x0, ) + +[rank0]: Traceback of TorchScript, original code (most recent call last): +[rank0]: /lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/module/layers3d.py(193): forward +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl +[rank0]: /lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/module/layers3d.py(680): forward +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py(218): forward +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(1274): trace_module +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(694): _trace_impl +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(999): trace +[rank0]: /usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py(574): _fn +[rank0]: /lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/checkpointer.py(270): _get_ema_jit +[rank0]: /lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/checkpointer.py(94): save +[rank0]: /lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/projects/edify_tokenizer/v1/trainer.py(139): train +[rank0]: /lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/scripts/train.py(44): launch +[rank0]: 
/usr/local/lib/python3.10/dist-packages/loguru/_logger.py(1277): catch_wrapper +[rank0]: /lustre/fs3/portfolios/nvr/users/jinweig/edify_tokenizer1/cosmos/DV720_Causal_FSQ_T17_f4x8x8/scripts/train.py(80): +[rank0]: /usr/lib/python3.10/runpy.py(86): _run_code +[rank0]: /usr/lib/python3.10/runpy.py(196): _run_module_as_main +[rank0]: RuntimeError: The size of tensor a (22) must match the size of tensor b (23) at non-singleton dimension 3 diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/files/requirements.txt b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 
+imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8c277a624c43877f9766a41027cc20e93775ea73 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T16:21:04.467894Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "16", + "--video_height", + "180", + "--video_width", + "320", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133356040192" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": 
"GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "p7qmb4hazkd40rqjeu38ugero566r4ft" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/files/wandb-summary.json b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..090134c0bee209a0eb6edb4ebf8ccb739b16b3f9 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162104-bumjum6e/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":3,"_wandb":{"runtime":3}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug-core.log b/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..b51bac218e7130961e35337c6eed5c7e9a55462b --- /dev/null +++ b/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T16:21:04.538365948Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp52270int/port-886533.txt","pid":886533,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T16:21:04.538886732Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":886533} +{"time":"2025-12-09T16:21:04.538872677Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-886533-886801-2796040377/socket","Net":"unix"}} +{"time":"2025-12-09T16:21:04.72471488Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T16:21:04.730804303Z","level":"INFO","msg":"handleInformInit: received","streamId":"bumjum6e","id":"1(@)"} +{"time":"2025-12-09T16:21:04.901056649Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"bumjum6e","id":"1(@)"} +{"time":"2025-12-09T16:21:09.118999185Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T16:21:09.119058085Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T16:21:09.119054219Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T16:21:09.119113772Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-09T16:21:09.119178717Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-886533-886801-2796040377/socket","Net":"unix"}} +{"time":"2025-12-09T16:21:09.629171629Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T16:21:09.629191641Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T16:21:09.629200445Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug-internal.log 
b/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c77e4535266b1aed21bcc51de67297eba08f8cfc --- /dev/null +++ b/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T16:21:04.730922438Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T16:21:04.900827158Z","level":"INFO","msg":"stream: created new stream","id":"bumjum6e"} +{"time":"2025-12-09T16:21:04.900930312Z","level":"INFO","msg":"handler: started","stream_id":"bumjum6e"} +{"time":"2025-12-09T16:21:04.901050385Z","level":"INFO","msg":"stream: started","id":"bumjum6e"} +{"time":"2025-12-09T16:21:04.901059991Z","level":"INFO","msg":"writer: started","stream_id":"bumjum6e"} +{"time":"2025-12-09T16:21:04.901067049Z","level":"INFO","msg":"sender: started","stream_id":"bumjum6e"} +{"time":"2025-12-09T16:21:09.119061953Z","level":"INFO","msg":"stream: closing","id":"bumjum6e"} +{"time":"2025-12-09T16:21:09.520970004Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T16:21:09.627220431Z","level":"INFO","msg":"handler: closed","stream_id":"bumjum6e"} +{"time":"2025-12-09T16:21:09.627324636Z","level":"INFO","msg":"sender: closed","stream_id":"bumjum6e"} +{"time":"2025-12-09T16:21:09.627331413Z","level":"INFO","msg":"stream: closed","id":"bumjum6e"} diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug.log b/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5dca6b8a923039808b7a4c91e731883ab398c788 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 16:21:04,470 INFO MainThread:886533 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_setup.py:_flush():80] Configure stats pid to 886533 +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug.log +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_162104-bumjum6e/logs/debug-internal.log +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_init.py:init():841] calling init triggers +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 16:21:04,471 INFO MainThread:886533 [wandb_init.py:init():889] starting backend +2025-12-09 16:21:04,724 INFO MainThread:886533 [wandb_init.py:init():892] sending inform_init request +2025-12-09 16:21:04,729 INFO MainThread:886533 [wandb_init.py:init():900] backend started and connected +2025-12-09 16:21:04,731 INFO MainThread:886533 [wandb_init.py:init():970] updated telemetry +2025-12-09 16:21:04,736 INFO MainThread:886533 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 16:21:05,183 INFO 
MainThread:886533 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 16:21:05,307 INFO MainThread:886533 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 16:21:05,307 INFO MainThread:886533 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 16:21:05,308 INFO MainThread:886533 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 16:21:05,308 INFO MainThread:886533 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 16:21:05,310 INFO MainThread:886533 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 16:21:05,311 INFO MainThread:886533 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 180, 'video_width': 320, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'features_dir': None} +2025-12-09 16:21:09,118 INFO wandb-AsyncioManager-main:886533 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-09 16:21:09,118 INFO wandb-AsyncioManager-main:886533 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
diff --git a/Meissonic/wandb/run-20251209_162104-bumjum6e/run-bumjum6e.wandb b/Meissonic/wandb/run-20251209_162104-bumjum6e/run-bumjum6e.wandb new file mode 100644 index 0000000000000000000000000000000000000000..5d89b4ae5c12f106ef197a35529f3433371d0ba3 Binary files /dev/null and b/Meissonic/wandb/run-20251209_162104-bumjum6e/run-bumjum6e.wandb differ diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/config.yaml b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10eb95f5814eadbcafadabdda05da9559013549e --- /dev/null +++ b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/config.yaml @@ -0,0 +1,298 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 81327e92g5gbekeuqlzq67d6vv3i4wr5: + args: + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "16" + - --video_height + - "256" + - --video_width + - "256" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "2" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12133356142592" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - 
architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-09T16:21:54.995447Z" + writerId: 81327e92g5gbekeuqlzq67d6vv3i4wr5 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: null +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 16 +output_dir: + value: ./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 2 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: false +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 256 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 256 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/output.log b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d7700d99ed807c48078de496272620cb2565dc23 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/output.log @@ -0,0 +1,69 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 67.51it/s] +You 
are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 16:21:58 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6184.99it/s] +12/09/2025 16:22:00 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=5, H'=32, W'=32 +12/09/2025 16:22:00 - INFO - __main__ - Theoretical dimensions: F'=4, H'=32, W'=32 +12/09/2025 16:22:00 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:22:00 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 16:22:15 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:22:15 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:22:17 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 16:22:19 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 16:22:19 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/09/2025 16:22:19 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 16:22:19 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 16:22:26 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 16:22:26 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 16:22:26 - INFO - __main__ - Dataloader configuration: +12/09/2025 16:22:26 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 16:22:26 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 16:22:26 - INFO - __main__ - - persistent_workers: True +12/09/2025 16:22:26 - INFO - __main__ - - pin_memory: True +12/09/2025 16:22:26 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 16:22:45 - INFO - __main__ - ***** Running training ***** +12/09/2025 16:22:45 - INFO - __main__ - Num training steps = 10000 +12/09/2025 16:22:45 - INFO - __main__ - Instantaneous batch size per device = 2 +12/09/2025 16:22:45 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/09/2025 16:22:45 - INFO - __main__ - Gradient Accumulation steps = 4 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1509, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.44 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.44 GiB is free. Process 887552 has 414.00 MiB memory in use. Process 887551 has 414.00 MiB memory in use. Process 887549 has 414.00 MiB memory in use. Process 887553 has 414.00 MiB memory in use. Process 887555 has 414.00 MiB memory in use. Process 887554 has 414.00 MiB memory in use. Process 887550 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 35.16 GiB memory in use. Of the allocated memory 26.13 GiB is allocated by PyTorch, and 7.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1509, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.44 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.44 GiB is free. Process 887552 has 414.00 MiB memory in use. Process 887551 has 414.00 MiB memory in use. Process 887549 has 414.00 MiB memory in use. Process 887553 has 414.00 MiB memory in use. Process 887555 has 414.00 MiB memory in use. Process 887554 has 414.00 MiB memory in use. Process 887550 has 414.00 MiB memory in use. Including non-PyTorch memory, this process has 35.16 GiB memory in use. 
Of the allocated memory 26.13 GiB is allocated by PyTorch, and 7.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Exception ignored in atexit callback: +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1648, in _clean_up_worker + w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/process.py", line 149, in join + res = self._popen.wait(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait + if not wait([self.sentinel], timeout): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt: diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/requirements.txt b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 
+aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c34dc3d00e9fbad98f96e4cda81008a82ddb3e77 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T16:21:54.995447Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "16", + "--video_height", + "256", + "--video_width", + "256", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133356142592" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + 
"cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "81327e92g5gbekeuqlzq67d6vv3i4wr5" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/wandb-summary.json b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c90edda74f6e5a2a5eb008d0c2f1dd0de08aa4f7 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162154-cecoyxuk/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":82,"_wandb":{"runtime":82}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug-core.log b/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..08670b71a5df31d16801fb26dd09b43889e9af47 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-09T16:21:55.090725913Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpmbxuxjg6/port-887548.txt","pid":887548,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T16:21:55.09117577Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":887548} +{"time":"2025-12-09T16:21:55.091170115Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-887548-887812-3055066833/socket","Net":"unix"}} +{"time":"2025-12-09T16:21:55.277788729Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T16:21:55.285108101Z","level":"INFO","msg":"handleInformInit: received","streamId":"cecoyxuk","id":"1(@)"} +{"time":"2025-12-09T16:21:55.453430312Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"cecoyxuk","id":"1(@)"} +{"time":"2025-12-09T16:23:18.157064432Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-09T16:23:18.157157066Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-09T16:23:18.157213001Z","level":"INFO","msg":"connection: closed 
successfully","id":"1(@)"} +{"time":"2025-12-09T16:23:18.157169765Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-09T16:23:18.157292313Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-887548-887812-3055066833/socket","Net":"unix"}} +{"time":"2025-12-09T16:23:18.608002595Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-09T16:23:18.60802021Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-09T16:23:18.6080287Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug-internal.log b/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..4e156dc6815f802f12a974277437c03b73be9e66 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-09T16:21:55.285210268Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T16:21:55.453210882Z","level":"INFO","msg":"stream: created new stream","id":"cecoyxuk"} +{"time":"2025-12-09T16:21:55.453305081Z","level":"INFO","msg":"handler: started","stream_id":"cecoyxuk"} +{"time":"2025-12-09T16:21:55.453422234Z","level":"INFO","msg":"stream: started","id":"cecoyxuk"} +{"time":"2025-12-09T16:21:55.453440215Z","level":"INFO","msg":"writer: started","stream_id":"cecoyxuk"} +{"time":"2025-12-09T16:21:55.453443538Z","level":"INFO","msg":"sender: started","stream_id":"cecoyxuk"} +{"time":"2025-12-09T16:23:18.157185906Z","level":"INFO","msg":"stream: closing","id":"cecoyxuk"} +{"time":"2025-12-09T16:23:18.459780648Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-09T16:23:18.605017145Z","level":"INFO","msg":"handler: closed","stream_id":"cecoyxuk"} +{"time":"2025-12-09T16:23:18.605075297Z","level":"INFO","msg":"sender: closed","stream_id":"cecoyxuk"} +{"time":"2025-12-09T16:23:18.605081119Z","level":"INFO","msg":"stream: closed","id":"cecoyxuk"} diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug.log b/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..3660bb64e0d22dfffb516098d4c640ffc7b439eb --- /dev/null +++ b/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-09 16:21:55,000 INFO MainThread:887548 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 16:21:55,000 INFO MainThread:887548 [wandb_setup.py:_flush():80] Configure stats pid to 887548 +2025-12-09 16:21:55,000 INFO MainThread:887548 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 16:21:55,000 INFO MainThread:887548 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 16:21:55,000 INFO MainThread:887548 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 16:21:55,000 INFO MainThread:887548 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug.log +2025-12-09 16:21:55,001 INFO MainThread:887548 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_162154-cecoyxuk/logs/debug-internal.log +2025-12-09 16:21:55,001 INFO MainThread:887548 [wandb_init.py:init():841] calling 
init triggers +2025-12-09 16:21:55,001 INFO MainThread:887548 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 16:21:55,001 INFO MainThread:887548 [wandb_init.py:init():889] starting backend +2025-12-09 16:21:55,278 INFO MainThread:887548 [wandb_init.py:init():892] sending inform_init request +2025-12-09 16:21:55,283 INFO MainThread:887548 [wandb_init.py:init():900] backend started and connected +2025-12-09 16:21:55,285 INFO MainThread:887548 [wandb_init.py:init():970] updated telemetry +2025-12-09 16:21:55,291 INFO MainThread:887548 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 16:21:55,636 INFO MainThread:887548 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 16:21:55,819 INFO MainThread:887548 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 16:21:55,819 INFO MainThread:887548 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 16:21:55,819 INFO MainThread:887548 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-09 16:21:55,819 INFO MainThread:887548 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 16:21:55,822 INFO MainThread:887548 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 16:21:55,823 INFO MainThread:887548 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'features_dir': None} +2025-12-09 16:23:18,157 INFO wandb-AsyncioManager-main:887548 [service_client.py:_forward_responses():80] Reached EOF. 
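For context on the failed run above (cecoyxuk): it hit CUDA OOM on the 40 GiB A100s inside accelerator.backward(), before the first logged step, with --video_height/--video_width 256 and 16 frames (compressed dims F'=5, H'=32, W'=32); the follow-up run below (uv3abozu) drops the resolution to 128x128 (F'=5, H'=16, W'=16) and trains past step 190. A back-of-the-envelope comparison using only numbers already printed by the script (illustrative sketch, not code from the repository):

# Back-of-the-envelope only: discrete-token counts implied by the compressed
# dimensions the script printed (Cosmos DV4x8x8), OOM run vs. retry.
def tokens_per_sample(f, h, w):
    return f * h * w

run_256 = tokens_per_sample(5, 32, 32)   # 256x256, 16 frames -> 5120 tokens
run_128 = tokens_per_sample(5, 16, 16)   # 128x128, 16 frames -> 1280 tokens
print(run_256, run_128, run_256 / run_128)   # 5120 1280 4.0
# Self-attention cost grows roughly with the square of the token count,
# so the retry is on the order of 16x cheaper per sample on that term alone.

# The OOM message itself also suggests, if relaunching at the higher resolution,
# setting this in the environment before CUDA is initialised to reduce
# allocator fragmentation:
#   PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True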
+2025-12-09 16:23:18,157 INFO wandb-AsyncioManager-main:887548 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251209_162154-cecoyxuk/run-cecoyxuk.wandb b/Meissonic/wandb/run-20251209_162154-cecoyxuk/run-cecoyxuk.wandb new file mode 100644 index 0000000000000000000000000000000000000000..7d813548e478df7f4f2d787ed24ccb9fcd519a26 Binary files /dev/null and b/Meissonic/wandb/run-20251209_162154-cecoyxuk/run-cecoyxuk.wandb differ diff --git a/Meissonic/wandb/run-20251209_162337-uv3abozu/files/output.log b/Meissonic/wandb/run-20251209_162337-uv3abozu/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d1a84bc288c1c67690fe139d73be7c59287cf6bf --- /dev/null +++ b/Meissonic/wandb/run-20251209_162337-uv3abozu/files/output.log @@ -0,0 +1,47 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 68.94it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/09/2025 16:23:40 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4862.56it/s] +12/09/2025 16:23:42 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=5, H'=16, W'=16 +12/09/2025 16:23:42 - INFO - __main__ - Theoretical dimensions: F'=4, H'=16, W'=16 +12/09/2025 16:23:42 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:23:42 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/09/2025 16:23:58 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:23:58 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/09/2025 16:23:59 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/09/2025 16:24:02 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/09/2025 16:24:02 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/09/2025 16:24:02 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/09/2025 16:24:02 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/09/2025 16:24:09 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/09/2025 16:24:09 - INFO - train.dataset_utils - Using decord for video loading +12/09/2025 16:24:09 - INFO - __main__ - Dataloader configuration: +12/09/2025 16:24:09 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/09/2025 16:24:09 - INFO - __main__ - - prefetch_factor: 2 +12/09/2025 16:24:09 - INFO - __main__ - - persistent_workers: True +12/09/2025 16:24:09 - INFO - __main__ - - pin_memory: True +12/09/2025 16:24:09 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/09/2025 16:24:26 - INFO - __main__ - ***** Running training ***** +12/09/2025 
16:24:26 - INFO - __main__ - Num training steps = 10000 +12/09/2025 16:24:26 - INFO - __main__ - Instantaneous batch size per device = 2 +12/09/2025 16:24:26 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 64 +12/09/2025 16:24:26 - INFO - __main__ - Gradient Accumulation steps = 4 +12/09/2025 16:25:16 - INFO - __main__ - Step: 10 Loss: 11.0758 LR: 0.000060 +12/09/2025 16:25:49 - INFO - __main__ - Step: 20 Loss: 11.0759 LR: 0.000060 +12/09/2025 16:26:23 - INFO - __main__ - Step: 30 Loss: 11.0732 LR: 0.000060 +12/09/2025 16:26:57 - INFO - __main__ - Step: 40 Loss: 11.0707 LR: 0.000060 +12/09/2025 16:27:31 - INFO - __main__ - Step: 50 Loss: 11.0721 LR: 0.000060 +12/09/2025 16:28:04 - INFO - __main__ - Step: 60 Loss: 11.0691 LR: 0.000060 +12/09/2025 16:28:39 - INFO - __main__ - Step: 70 Loss: 11.0651 LR: 0.000060 +12/09/2025 16:29:15 - INFO - __main__ - Step: 80 Loss: 11.0624 LR: 0.000060 +12/09/2025 16:29:51 - INFO - __main__ - Step: 90 Loss: 11.0556 LR: 0.000060 +12/09/2025 16:30:26 - INFO - __main__ - Step: 100 Loss: 11.0429 LR: 0.000060 +12/09/2025 16:31:02 - INFO - __main__ - Step: 110 Loss: 11.0275 LR: 0.000060 +12/09/2025 16:31:38 - INFO - __main__ - Step: 120 Loss: 11.0152 LR: 0.000060 +12/09/2025 16:32:13 - INFO - __main__ - Step: 130 Loss: 10.9992 LR: 0.000060 +12/09/2025 16:32:47 - INFO - __main__ - Step: 140 Loss: 10.9513 LR: 0.000060 +12/09/2025 16:33:22 - INFO - __main__ - Step: 150 Loss: 10.9467 LR: 0.000060 +12/09/2025 16:33:55 - INFO - __main__ - Step: 160 Loss: 10.9003 LR: 0.000060 +12/09/2025 16:34:29 - INFO - __main__ - Step: 170 Loss: 10.9009 LR: 0.000060 +12/09/2025 16:35:03 - INFO - __main__ - Step: 180 Loss: 10.8768 LR: 0.000060 +12/09/2025 16:35:36 - INFO - __main__ - Step: 190 Loss: 10.8298 LR: 0.000060 diff --git a/Meissonic/wandb/run-20251209_162337-uv3abozu/files/requirements.txt b/Meissonic/wandb/run-20251209_162337-uv3abozu/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251209_162337-uv3abozu/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 
+nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251209_162337-uv3abozu/files/wandb-metadata.json b/Meissonic/wandb/run-20251209_162337-uv3abozu/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..636026fed6fa8311eb1aee1b7514b4fb81dbed63 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162337-uv3abozu/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-09T16:23:37.188415Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "16", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": 
"https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133356269568" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "c4qpena2ayixc8utojqhldz85b5zaowt" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug-core.log b/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..8143ddfc546ad8098d314d3f0af17dbb3c68581a --- /dev/null +++ b/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-09T16:23:37.257637154Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpi9umj7zm/port-896751.txt","pid":896751,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-09T16:23:37.258144523Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":896751} +{"time":"2025-12-09T16:23:37.258155302Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-896751-897022-3529873079/socket","Net":"unix"}} +{"time":"2025-12-09T16:23:37.444237022Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-09T16:23:37.450217059Z","level":"INFO","msg":"handleInformInit: received","streamId":"uv3abozu","id":"1(@)"} +{"time":"2025-12-09T16:23:37.620567422Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"uv3abozu","id":"1(@)"} +{"time":"2025-12-09T16:36:38.972940398Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git 
a/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug-internal.log b/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2554b4b22c2202bda8508803b3f960f70cdc9ce1 --- /dev/null +++ b/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-09T16:23:37.450366978Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-09T16:23:37.620280029Z","level":"INFO","msg":"stream: created new stream","id":"uv3abozu"} +{"time":"2025-12-09T16:23:37.620379333Z","level":"INFO","msg":"handler: started","stream_id":"uv3abozu"} +{"time":"2025-12-09T16:23:37.620559096Z","level":"INFO","msg":"stream: started","id":"uv3abozu"} +{"time":"2025-12-09T16:23:37.620573354Z","level":"INFO","msg":"sender: started","stream_id":"uv3abozu"} +{"time":"2025-12-09T16:23:37.620576443Z","level":"INFO","msg":"writer: started","stream_id":"uv3abozu"} diff --git a/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug.log b/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8427b90899ae5ef2a1e6e1b83b7bc369791d8d7f --- /dev/null +++ b/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_setup.py:_flush():80] Configure stats pid to 896751 +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug.log +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251209_162337-uv3abozu/logs/debug-internal.log +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_init.py:init():841] calling init triggers +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-09 16:23:37,191 INFO MainThread:896751 [wandb_init.py:init():889] starting backend +2025-12-09 16:23:37,444 INFO MainThread:896751 [wandb_init.py:init():892] sending inform_init request +2025-12-09 16:23:37,448 INFO MainThread:896751 [wandb_init.py:init():900] backend started and connected +2025-12-09 16:23:37,450 INFO MainThread:896751 [wandb_init.py:init():970] updated telemetry +2025-12-09 16:23:37,454 INFO MainThread:896751 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-09 16:23:37,893 INFO MainThread:896751 [wandb_init.py:init():1041] starting run threads in backend +2025-12-09 16:23:38,016 INFO MainThread:896751 [wandb_run.py:_console_start():2521] atexit reg +2025-12-09 16:23:38,016 INFO MainThread:896751 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-09 16:23:38,016 INFO MainThread:896751 [wandb_run.py:_redirect():2438] Wrapping output streams. 
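For reference, the compressed shapes the script reports for Cosmos-0.1-Tokenizer-DV4x8x8 follow from the model id (4x temporal, 8x8 spatial compression); the gap between the "actual" F'=5 and "theoretical" F'=4 for 16 input frames is consistent with a causal temporal scheme that keeps the first frame separate. A small sketch of that arithmetic (the causal rule is an assumption inferred from the logged values, not taken from the tokenizer's source):

import math

def compressed_dims(num_frames, height, width, t_factor=4, s_factor=8, causal=True):
    # Spatial factors follow directly from the DV4x8x8 model id; the causal
    # temporal rule below is an assumption that reproduces the logged values.
    h = height // s_factor
    w = width // s_factor
    if causal:
        f = 1 + math.ceil((num_frames - 1) / t_factor)   # 16 frames -> 5
    else:
        f = num_frames // t_factor                        # the "theoretical" 4
    return f, h, w

print(compressed_dims(16, 128, 128))  # (5, 16, 16), as reported for the 128x128 runs
print(compressed_dims(16, 256, 256))  # (5, 32, 32), as in the earlier 256x256 run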
+2025-12-09 16:23:38,017 INFO MainThread:896751 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-09 16:23:38,019 INFO MainThread:896751 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-09 16:23:38,020 INFO MainThread:896751 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251209_162337-uv3abozu/run-uv3abozu.wandb b/Meissonic/wandb/run-20251209_162337-uv3abozu/run-uv3abozu.wandb new file mode 100644 index 0000000000000000000000000000000000000000..4d77d3854a3dde8c50e14d882d2d8fe7690dea6b --- /dev/null +++ b/Meissonic/wandb/run-20251209_162337-uv3abozu/run-uv3abozu.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46dd39463ae72c061a5c24857a9368679ddd358bdb34c5750ed1f99cfbb4e7b4 +size 196608 diff --git a/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/output.log b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..4f9c648dc629ce4ba5f330848987b53066014084 --- /dev/null +++ b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/output.log @@ -0,0 +1,47 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 66.77it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/10/2025 03:03:29 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3535.66it/s] +12/10/2025 03:03:31 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=5, H'=16, W'=16 +12/10/2025 03:03:31 - INFO - __main__ - Theoretical dimensions: F'=4, H'=16, W'=16 +12/10/2025 03:03:31 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:03:31 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 03:03:47 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:03:47 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:03:50 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 03:03:54 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 03:03:54 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 03:03:54 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 03:03:54 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 03:04:01 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/10/2025 03:04:01 - INFO - train.dataset_utils - Using decord for video loading +12/10/2025 03:04:01 - INFO - __main__ - Dataloader configuration: +12/10/2025 03:04:01 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 03:04:01 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 03:04:01 - INFO - __main__ - - persistent_workers: True +12/10/2025 03:04:01 - INFO - __main__ - - pin_memory: True +12/10/2025 03:04:01 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 03:04:16 - INFO - __main__ - ***** Running training ***** +12/10/2025 03:04:16 - INFO - __main__ - Num training steps = 10000 +12/10/2025 03:04:16 - INFO - __main__ - Instantaneous batch size per device = 2 +12/10/2025 03:04:16 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/10/2025 03:04:16 - INFO - __main__ - Gradient Accumulation steps = 4 +12/10/2025 03:05:04 - INFO - __main__ - Step: 10 Loss: 11.0757 LR: 0.000060 +12/10/2025 03:05:37 - INFO - __main__ - Step: 20 Loss: 11.0758 LR: 0.000060 +12/10/2025 03:06:10 - INFO - __main__ - Step: 30 Loss: 11.0730 LR: 0.000060 +12/10/2025 03:06:43 - INFO - __main__ - Step: 40 Loss: 11.0716 LR: 0.000060 +12/10/2025 03:07:18 - INFO - __main__ - Step: 50 Loss: 11.0721 LR: 0.000060 +12/10/2025 03:07:51 - INFO - __main__ - Step: 60 Loss: 11.0697 LR: 0.000060 +12/10/2025 03:08:26 - INFO - __main__ - Step: 70 Loss: 11.0646 LR: 0.000060 +12/10/2025 03:09:00 - INFO - __main__ - Step: 80 Loss: 11.0623 LR: 0.000060 +12/10/2025 03:09:35 - INFO - __main__ - Step: 90 Loss: 11.0554 LR: 0.000060 +12/10/2025 03:10:10 - INFO - __main__ - Step: 100 Loss: 11.0420 LR: 0.000060 +12/10/2025 03:10:44 - INFO - __main__ - Step: 110 Loss: 11.0280 LR: 0.000060 +12/10/2025 03:11:19 - INFO - __main__ - Step: 120 Loss: 11.0138 LR: 0.000060 +12/10/2025 03:11:55 - INFO - __main__ - Step: 130 Loss: 11.0001 LR: 0.000060 +12/10/2025 03:12:29 - INFO - __main__ - Step: 140 Loss: 10.9526 LR: 0.000060 +12/10/2025 03:13:03 - INFO - __main__ - Step: 150 Loss: 10.9456 LR: 0.000060 +12/10/2025 03:13:38 - INFO - __main__ - Step: 160 Loss: 10.8995 LR: 0.000060 +12/10/2025 03:14:11 - INFO - __main__ - Step: 170 Loss: 10.9012 LR: 0.000060 +12/10/2025 03:14:46 - INFO - __main__ - Step: 180 Loss: 10.8755 LR: 0.000060 +12/10/2025 03:15:21 - INFO - __main__ - Step: 190 Loss: 10.8287 LR: 0.000060 diff --git a/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/requirements.txt b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 
+pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e467de2b6e445c266f6430acaf3c7ffef6c4bdaf --- /dev/null +++ b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T03:03:25.964354Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "16", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": 
"/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133356830720" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "ielhwboly7v6fm7l89mdg72fej61il8f" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug-core.log b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..33894bd41e774fcf74f6ac85769a02956852c006 --- /dev/null +++ b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-10T03:03:26.03254978Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpn0kc07t9/port-1508493.txt","pid":1508493,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T03:03:26.033003993Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1508493} +{"time":"2025-12-10T03:03:26.033013765Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1508493-1508740-1211382152/socket","Net":"unix"}} +{"time":"2025-12-10T03:03:26.21958791Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T03:03:26.225652825Z","level":"INFO","msg":"handleInformInit: received","streamId":"gkrz1ykg","id":"1(@)"} +{"time":"2025-12-10T03:03:26.396922559Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"gkrz1ykg","id":"1(@)"} +{"time":"2025-12-10T03:16:24.528540964Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug-internal.log b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug-internal.log new file mode 100644 index 
0000000000000000000000000000000000000000..714399f6694aa425751aceb97f1071fc778de206 --- /dev/null +++ b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-10T03:03:26.225797144Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T03:03:26.396690435Z","level":"INFO","msg":"stream: created new stream","id":"gkrz1ykg"} +{"time":"2025-12-10T03:03:26.396798298Z","level":"INFO","msg":"handler: started","stream_id":"gkrz1ykg"} +{"time":"2025-12-10T03:03:26.396913302Z","level":"INFO","msg":"stream: started","id":"gkrz1ykg"} +{"time":"2025-12-10T03:03:26.396933074Z","level":"INFO","msg":"writer: started","stream_id":"gkrz1ykg"} +{"time":"2025-12-10T03:03:26.396935917Z","level":"INFO","msg":"sender: started","stream_id":"gkrz1ykg"} diff --git a/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug.log b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..85c4623183dd924370dbc28ef2bf38a2ba142c76 --- /dev/null +++ b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_setup.py:_flush():80] Configure stats pid to 1508493 +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug.log +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_030325-gkrz1ykg/logs/debug-internal.log +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_init.py:init():841] calling init triggers +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 03:03:25,967 INFO MainThread:1508493 [wandb_init.py:init():889] starting backend +2025-12-10 03:03:26,219 INFO MainThread:1508493 [wandb_init.py:init():892] sending inform_init request +2025-12-10 03:03:26,224 INFO MainThread:1508493 [wandb_init.py:init():900] backend started and connected +2025-12-10 03:03:26,225 INFO MainThread:1508493 [wandb_init.py:init():970] updated telemetry +2025-12-10 03:03:26,229 INFO MainThread:1508493 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 03:03:26,692 INFO MainThread:1508493 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 03:03:26,875 INFO MainThread:1508493 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 03:03:26,875 INFO MainThread:1508493 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 03:03:26,875 INFO MainThread:1508493 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 03:03:26,875 INFO MainThread:1508493 [wandb_run.py:_redirect():2461] Redirects installed. 
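The optimizer setup these runs log (Wan backbone at base_lr * 0.2 = 6e-5, the new token_embedding/logits_head at 3e-4, 8-bit Adam, effective batch 64 = 2 per device x 8 GPUs x 4 accumulation steps) corresponds to ordinary PyTorch parameter groups. A minimal, self-contained sketch with placeholder module names, not the repository's actual classes:

import torch
import torch.nn as nn

# TinyVideoLM is a stand-in: backbone / token_embedding / logits_head are
# illustrative attribute names matching the modules named in the logs, and the
# vocab size is arbitrary; only dim=1536 / 12 heads mirror the logged Wan config.
class TinyVideoLM(nn.Module):
    def __init__(self, vocab=65536, dim=1536):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab, dim)
        self.backbone = nn.TransformerEncoderLayer(d_model=dim, nhead=12, batch_first=True)
        self.logits_head = nn.Linear(dim, vocab)

model = TinyVideoLM()

base_lr = 3e-4            # --learning_rate 3e-4
backbone_ratio = 0.2      # --wan_backbone_lr_ratio 0.2 -> 6e-5 for the backbone
param_groups = [
    {"params": model.backbone.parameters(), "lr": base_lr * backbone_ratio},
    {"params": list(model.token_embedding.parameters())
             + list(model.logits_head.parameters()), "lr": base_lr},
]
# The runs use 8-bit Adam via bitsandbytes; torch.optim.AdamW stands in here.
optimizer = torch.optim.AdamW(param_groups, betas=(0.9, 0.999), weight_decay=0.01)
print([g["lr"] for g in optimizer.param_groups])  # [6e-05, 0.0003]

# Effective batch size as reported by the script:
assert 2 * 8 * 4 == 64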
+2025-12-10 03:03:26,878 INFO MainThread:1508493 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 03:03:26,879 INFO MainThread:1508493 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 16, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251210_030325-gkrz1ykg/run-gkrz1ykg.wandb b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/run-gkrz1ykg.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e19bc005dc870403a3d14076efdda2679e154ce0 --- /dev/null +++ b/Meissonic/wandb/run-20251210_030325-gkrz1ykg/run-gkrz1ykg.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e29557a130e180ecbfc8f8b4eae69e163cce17814b944aa9a281bfd86671e3b +size 196608 diff --git a/Meissonic/wandb/run-20251210_032745-o7so78o8/files/output.log b/Meissonic/wandb/run-20251210_032745-o7so78o8/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9117a0dc2579901f049330a882919a2b1e6dc7bc --- /dev/null +++ b/Meissonic/wandb/run-20251210_032745-o7so78o8/files/output.log @@ -0,0 +1,47 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 71.34it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/10/2025 03:27:48 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6659.14it/s] +12/10/2025 03:27:50 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=5, H'=16, W'=16 +12/10/2025 03:27:50 - INFO - __main__ - Theoretical dimensions: F'=4, H'=16, W'=16 +12/10/2025 03:27:50 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:27:50 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 03:28:05 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:28:05 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:28:07 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 03:28:09 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 03:28:09 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 03:28:09 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 03:28:09 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 03:28:16 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/10/2025 03:28:16 - INFO - train.dataset_utils - Using decord for video loading +12/10/2025 03:28:16 - INFO - __main__ - Dataloader configuration: +12/10/2025 03:28:16 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 03:28:16 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 03:28:16 - INFO - __main__ - - persistent_workers: True +12/10/2025 03:28:16 - INFO - __main__ - - pin_memory: True +12/10/2025 03:28:16 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 03:28:34 - INFO - __main__ - ***** Running training ***** +12/10/2025 03:28:34 - INFO - __main__ - Num training steps = 10000 +12/10/2025 03:28:34 - INFO - __main__ - Instantaneous batch size per device = 2 +12/10/2025 03:28:34 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/10/2025 03:28:34 - INFO - __main__ - Gradient Accumulation steps = 4 +12/10/2025 03:29:23 - INFO - __main__ - Step: 10 Loss: 11.0751 LR: 0.000060 +12/10/2025 03:29:58 - INFO - __main__ - Step: 20 Loss: 11.0745 LR: 0.000060 +12/10/2025 03:30:33 - INFO - __main__ - Step: 30 Loss: 11.0720 LR: 0.000060 +12/10/2025 03:31:07 - INFO - __main__ - Step: 40 Loss: 11.0713 LR: 0.000060 +12/10/2025 03:31:42 - INFO - __main__ - Step: 50 Loss: 11.0713 LR: 0.000060 +12/10/2025 03:32:15 - INFO - __main__ - Step: 60 Loss: 11.0692 LR: 0.000060 +12/10/2025 03:32:49 - INFO - __main__ - Step: 70 Loss: 11.0663 LR: 0.000060 +12/10/2025 03:33:24 - INFO - __main__ - Step: 80 Loss: 11.0617 LR: 0.000060 +12/10/2025 03:33:59 - INFO - __main__ - Step: 90 Loss: 11.0555 LR: 0.000060 +12/10/2025 03:34:34 - INFO - __main__ - Step: 100 Loss: 11.0448 LR: 0.000060 +12/10/2025 03:35:08 - INFO - __main__ - Step: 110 Loss: 11.0327 LR: 0.000060 +12/10/2025 03:35:43 - INFO - __main__ - Step: 120 Loss: 11.0147 LR: 0.000060 +12/10/2025 03:36:19 - INFO - __main__ - Step: 130 Loss: 10.9994 LR: 0.000060 +12/10/2025 03:36:54 - INFO - __main__ - Step: 140 Loss: 10.9611 LR: 0.000060 +12/10/2025 03:37:28 - INFO - __main__ - Step: 150 Loss: 10.9458 LR: 0.000060 +12/10/2025 03:38:03 - INFO - __main__ - Step: 160 Loss: 10.9011 LR: 0.000060 +12/10/2025 03:38:37 - INFO - __main__ - Step: 170 Loss: 10.9101 LR: 0.000060 +12/10/2025 03:39:12 - INFO - __main__ - Step: 180 Loss: 10.8789 LR: 0.000060 +12/10/2025 03:39:47 - INFO - __main__ - Step: 190 Loss: 10.8317 LR: 0.000060 diff --git a/Meissonic/wandb/run-20251210_032745-o7so78o8/files/requirements.txt b/Meissonic/wandb/run-20251210_032745-o7so78o8/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_032745-o7so78o8/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 
+pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_032745-o7so78o8/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_032745-o7so78o8/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9fa7d522f0916775f6e56b0f75b02c88b41403de --- /dev/null +++ b/Meissonic/wandb/run-20251210_032745-o7so78o8/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T03:27:45.069546Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": 
"/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133357502464" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "az0nthtl1eiu5tz12udubozi6ufhx0at" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug-core.log b/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..404de629459ea1ec9dacd8fe83993b9e719723fe --- /dev/null +++ b/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-10T03:27:45.136792287Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpo0exnb9c/port-1749619.txt","pid":1749619,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T03:27:45.13733044Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1749619} +{"time":"2025-12-10T03:27:45.137306574Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1749619-1749884-2107733221/socket","Net":"unix"}} +{"time":"2025-12-10T03:27:45.323961753Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T03:27:45.329819545Z","level":"INFO","msg":"handleInformInit: received","streamId":"o7so78o8","id":"1(@)"} +{"time":"2025-12-10T03:27:45.496539195Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"o7so78o8","id":"1(@)"} +{"time":"2025-12-10T03:40:49.793952726Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug-internal.log b/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug-internal.log new file mode 100644 index 
0000000000000000000000000000000000000000..c089e97c3d6ca645dfec69f355ea2c55f39b5e38 --- /dev/null +++ b/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-10T03:27:45.329946027Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T03:27:45.496337243Z","level":"INFO","msg":"stream: created new stream","id":"o7so78o8"} +{"time":"2025-12-10T03:27:45.496427061Z","level":"INFO","msg":"handler: started","stream_id":"o7so78o8"} +{"time":"2025-12-10T03:27:45.496531545Z","level":"INFO","msg":"stream: started","id":"o7so78o8"} +{"time":"2025-12-10T03:27:45.496553803Z","level":"INFO","msg":"writer: started","stream_id":"o7so78o8"} +{"time":"2025-12-10T03:27:45.496557065Z","level":"INFO","msg":"sender: started","stream_id":"o7so78o8"} diff --git a/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug.log b/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d39907f0f9df11144958d668f0e58be33b3c034b --- /dev/null +++ b/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_setup.py:_flush():80] Configure stats pid to 1749619 +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug.log +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_032745-o7so78o8/logs/debug-internal.log +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_init.py:init():841] calling init triggers +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 03:27:45,072 INFO MainThread:1749619 [wandb_init.py:init():889] starting backend +2025-12-10 03:27:45,324 INFO MainThread:1749619 [wandb_init.py:init():892] sending inform_init request +2025-12-10 03:27:45,328 INFO MainThread:1749619 [wandb_init.py:init():900] backend started and connected +2025-12-10 03:27:45,329 INFO MainThread:1749619 [wandb_init.py:init():970] updated telemetry +2025-12-10 03:27:45,334 INFO MainThread:1749619 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 03:27:45,776 INFO MainThread:1749619 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 03:27:45,899 INFO MainThread:1749619 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 03:27:45,899 INFO MainThread:1749619 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 03:27:45,899 INFO MainThread:1749619 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 03:27:45,899 INFO MainThread:1749619 [wandb_run.py:_redirect():2461] Redirects installed. 
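The output.log for this run reports a split learning rate, "Wan backbone lr = 0.000060 (base_lr * 0.2)" versus 0.000300 for the newly initialized token_embedding and logits_head, and a total train batch size of 64, consistent with 2 per device x 4 accumulation steps x 8 GPUs. A minimal sketch of a two-group optimizer that reproduces those numbers is given below; it is an illustration under stated assumptions, not the repository's code, and it uses plain torch.optim.AdamW even though the run enables 8-bit Adam via bitsandbytes (--use_8bit_adam).

```python
import torch
import torch.nn as nn

# Hedged sketch, not the repository's code: a two-group optimizer where the
# pretrained Wan backbone trains at base_lr * wan_backbone_lr_ratio while the
# freshly initialized token_embedding / logits_head use the full base_lr,
# reproducing the logged 0.000060 and 0.000300.
class ToyModel(nn.Module):
    def __init__(self, vocab=1000, dim=16):
        super().__init__()
        self.backbone = nn.Linear(dim, dim)           # stands in for the Wan backbone
        self.token_embedding = nn.Embedding(vocab, dim)
        self.logits_head = nn.Linear(dim, vocab)

model = ToyModel()
base_lr, backbone_ratio = 3e-4, 0.2                  # --learning_rate, --wan_backbone_lr_ratio

backbone_params, other_params = [], []
for name, p in model.named_parameters():
    (backbone_params if name.startswith("backbone") else other_params).append(p)

optimizer = torch.optim.AdamW(
    [
        {"params": backbone_params, "lr": base_lr * backbone_ratio},  # 6.0e-05
        {"params": other_params, "lr": base_lr},                      # 3.0e-04
    ],
    betas=(0.9, 0.999), weight_decay=0.01, eps=1e-08,                 # adam_* values from the config
)
```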
+2025-12-10 03:27:45,902 INFO MainThread:1749619 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 03:27:45,903 INFO MainThread:1749619 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251210_032745-o7so78o8/run-o7so78o8.wandb b/Meissonic/wandb/run-20251210_032745-o7so78o8/run-o7so78o8.wandb new file mode 100644 index 0000000000000000000000000000000000000000..0dbbecbef1043d72338934caa2db97ec7ef5ddcd --- /dev/null +++ b/Meissonic/wandb/run-20251210_032745-o7so78o8/run-o7so78o8.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a6554e67bd7fe4a111c314aea3dfac25180cbc585b55d30390eb0bb11251cda +size 196608 diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/media/images/generated_videos_first_frame_500_081f6716317316010e23.png b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/media/images/generated_videos_first_frame_500_081f6716317316010e23.png new file mode 100644 index 0000000000000000000000000000000000000000..db98e10b357391b0c1aeac2daace217dee9d4ca2 Binary files /dev/null and b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/media/images/generated_videos_first_frame_500_081f6716317316010e23.png differ diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/media/images/generated_videos_first_frame_500_da9d6c1a97c444d47f3f.png b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/media/images/generated_videos_first_frame_500_da9d6c1a97c444d47f3f.png new file mode 100644 index 0000000000000000000000000000000000000000..3ffe531522b08c3914904d5d356469e67b7d63a7 Binary files /dev/null and 
b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/media/images/generated_videos_first_frame_500_da9d6c1a97c444d47f3f.png differ diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/output.log b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..43d2d89c79a656dccde7513fc072e25a0db8bee2 --- /dev/null +++ b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/output.log @@ -0,0 +1,90 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 67.78it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +12/10/2025 03:53:39 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096) +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5005.14it/s] +12/10/2025 03:53:41 - INFO - __main__ - Actual compressed dimensions from tokenizer: F'=5, H'=16, W'=16 +12/10/2025 03:53:41 - INFO - __main__ - Theoretical dimensions: F'=4, H'=16, W'=16 +12/10/2025 03:53:41 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:53:41 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 03:53:57 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:53:57 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 03:53:59 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 03:54:01 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 03:54:01 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 03:54:01 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 03:54:01 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 03:54:08 - INFO - train.dataset_utils - Loaded 1019957 video entries from /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +12/10/2025 03:54:08 - INFO - train.dataset_utils - Using decord for video loading +12/10/2025 03:54:08 - INFO - __main__ - Dataloader configuration: +12/10/2025 03:54:08 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 03:54:08 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 03:54:08 - INFO - __main__ - - persistent_workers: True +12/10/2025 03:54:08 - INFO - __main__ - - pin_memory: True +12/10/2025 03:54:08 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 03:54:25 - INFO - __main__ - ***** Running training ***** +12/10/2025 03:54:25 - INFO - __main__ - Num training steps = 10000 +12/10/2025 03:54:25 - INFO - __main__ - Instantaneous batch size per device = 2 +12/10/2025 03:54:25 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 64 +12/10/2025 03:54:25 - INFO - __main__ - Gradient Accumulation steps = 4 +12/10/2025 03:55:13 - INFO - __main__ - Step: 10 Loss: 11.0745 LR: 0.000060 +12/10/2025 03:55:48 - INFO - __main__ - Step: 20 Loss: 11.0750 LR: 0.000060 +12/10/2025 03:56:21 - INFO - __main__ - Step: 30 Loss: 11.0715 LR: 0.000060 +12/10/2025 03:56:55 - INFO - __main__ - Step: 40 Loss: 11.0719 LR: 0.000060 +12/10/2025 03:57:29 - INFO - __main__ - Step: 50 Loss: 11.0712 LR: 0.000060 +12/10/2025 03:58:03 - INFO - __main__ - Step: 60 Loss: 11.0689 LR: 0.000060 +12/10/2025 03:58:37 - INFO - __main__ - Step: 70 Loss: 11.0658 LR: 0.000060 +12/10/2025 03:59:11 - INFO - __main__ - Step: 80 Loss: 11.0614 LR: 0.000060 +12/10/2025 03:59:46 - INFO - __main__ - Step: 90 Loss: 11.0535 LR: 0.000060 +12/10/2025 04:00:20 - INFO - __main__ - Step: 100 Loss: 11.0433 LR: 0.000060 +12/10/2025 04:00:54 - INFO - __main__ - Step: 110 Loss: 11.0295 LR: 0.000060 +12/10/2025 04:01:29 - INFO - __main__ - Step: 120 Loss: 11.0128 LR: 0.000060 +12/10/2025 04:02:03 - INFO - __main__ - Step: 130 Loss: 10.9998 LR: 0.000060 +12/10/2025 04:02:37 - INFO - __main__ - Step: 140 Loss: 10.9615 LR: 0.000060 +12/10/2025 04:03:11 - INFO - __main__ - Step: 150 Loss: 10.9471 LR: 0.000060 +12/10/2025 04:03:47 - INFO - __main__ - Step: 160 Loss: 10.8990 LR: 0.000060 +12/10/2025 04:04:21 - INFO - __main__ - Step: 170 Loss: 10.9111 LR: 0.000060 +12/10/2025 04:04:55 - INFO - __main__ - Step: 180 Loss: 10.8777 LR: 0.000060 +12/10/2025 04:05:30 - INFO - __main__ - Step: 190 Loss: 10.8318 LR: 0.000060 +12/10/2025 04:06:05 - INFO - __main__ - Step: 200 Loss: 10.8156 LR: 0.000060 +12/10/2025 04:06:40 - INFO - __main__ - Step: 210 Loss: 10.7797 LR: 0.000060 +12/10/2025 04:07:15 - INFO - __main__ - Step: 220 Loss: 10.7180 LR: 0.000060 +12/10/2025 04:07:49 - INFO - __main__ - Step: 230 Loss: 10.7245 LR: 0.000060 +12/10/2025 04:08:25 - INFO - __main__ - Step: 240 Loss: 10.7025 LR: 0.000060 +12/10/2025 04:08:59 - INFO - __main__ - Step: 250 Loss: 10.6934 LR: 0.000060 +12/10/2025 04:09:35 - INFO - __main__ - Step: 260 Loss: 10.6302 LR: 0.000060 +12/10/2025 04:10:09 - INFO - __main__ - Step: 270 Loss: 10.6235 LR: 0.000060 +12/10/2025 04:10:44 - INFO - __main__ - Step: 280 Loss: 10.5938 LR: 0.000060 +12/10/2025 04:11:19 - INFO - __main__ - Step: 290 Loss: 10.6458 LR: 0.000060 +12/10/2025 04:11:54 - INFO - __main__ - Step: 300 Loss: 10.5509 LR: 0.000060 +12/10/2025 04:12:28 - INFO - __main__ - Step: 310 Loss: 10.5721 LR: 0.000060 +12/10/2025 04:13:03 - INFO - __main__ - Step: 320 Loss: 10.5578 LR: 0.000060 +12/10/2025 04:13:37 - INFO - __main__ - Step: 330 Loss: 10.5018 LR: 0.000060 +12/10/2025 04:14:12 - INFO - __main__ - Step: 340 Loss: 10.5305 LR: 0.000060 +12/10/2025 04:14:46 - INFO - __main__ - Step: 350 Loss: 10.5309 LR: 0.000060 +12/10/2025 04:15:20 - INFO - __main__ - Step: 360 Loss: 10.5047 LR: 0.000060 +12/10/2025 04:15:55 - INFO - __main__ - Step: 370 Loss: 10.4622 LR: 0.000060 +12/10/2025 04:16:30 - INFO - __main__ - Step: 380 Loss: 10.5089 LR: 0.000060 +12/10/2025 04:17:04 - INFO - __main__ - Step: 390 Loss: 10.4511 LR: 0.000060 +12/10/2025 04:17:39 - INFO - __main__ - Step: 400 Loss: 10.4623 LR: 0.000060 +12/10/2025 04:18:14 - INFO - __main__ - Step: 410 Loss: 10.5111 LR: 0.000060 +12/10/2025 04:18:47 - INFO - __main__ - Step: 420 Loss: 10.4446 LR: 0.000060 +12/10/2025 04:19:23 - INFO - __main__ - Step: 430 Loss: 10.4794 LR: 0.000060 +12/10/2025 04:19:57 - INFO - __main__ - Step: 440 Loss: 10.4818 LR: 0.000060 +12/10/2025 
04:20:32 - INFO - __main__ - Step: 450 Loss: 10.4768 LR: 0.000060 +12/10/2025 04:21:07 - INFO - __main__ - Step: 460 Loss: 10.4552 LR: 0.000060 +12/10/2025 04:21:42 - INFO - __main__ - Step: 470 Loss: 10.4234 LR: 0.000060 +12/10/2025 04:22:17 - INFO - __main__ - Step: 480 Loss: 10.4355 LR: 0.000060 +12/10/2025 04:22:52 - INFO - __main__ - Step: 490 Loss: 10.4183 LR: 0.000060 +12/10/2025 04:23:26 - INFO - __main__ - Step: 500 Loss: 10.5091 LR: 0.000060 +12/10/2025 04:23:26 - INFO - accelerate.accelerator - Saving current state to output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 04:23:35 - INFO - accelerate.checkpointing - Optimizer state saved in output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/optimizer.bin +12/10/2025 04:23:35 - INFO - accelerate.checkpointing - Scheduler state saved in output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/scheduler.bin +12/10/2025 04:23:35 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/sampler.bin +12/10/2025 04:23:35 - INFO - accelerate.checkpointing - Random states saved in output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl +12/10/2025 04:23:35 - INFO - __main__ - Saved state to output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 04:23:35 - INFO - __main__ - Generating videos for validation... +12/10/2025 04:23:35 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.04it/s] +12/10/2025 04:23:42 - INFO - __main__ - Validation videos saved to ./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio +12/10/2025 04:24:19 - INFO - __main__ - Step: 510 Loss: 10.3847 LR: 0.000060 +12/10/2025 04:24:54 - INFO - __main__ - Step: 520 Loss: 10.4455 LR: 0.000060 diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/requirements.txt b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 
+pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..1a9049e1a096feb538e1e49f30cee5b508b7e2be --- /dev/null +++ b/Meissonic/wandb/run-20251210_035336-u8db4xs3/files/wandb-metadata.json @@ -0,0 +1,153 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T03:53:36.383134Z", + "args": [ + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": 
"/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12133357961216" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "pbm2fbj3vcjuj9huqsanqzcnp62wayv1" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug-core.log b/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..d6ae206e1dba1ca64d735aa67296609f98bfb5e9 --- /dev/null +++ b/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-10T03:53:36.452090255Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpte7_g0so/port-1991718.txt","pid":1991718,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T03:53:36.452554454Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1991718} +{"time":"2025-12-10T03:53:36.452557571Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1991718-1991975-1642945423/socket","Net":"unix"}} +{"time":"2025-12-10T03:53:36.638650796Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T03:53:36.644680584Z","level":"INFO","msg":"handleInformInit: received","streamId":"u8db4xs3","id":"1(@)"} +{"time":"2025-12-10T03:53:36.814688663Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"u8db4xs3","id":"1(@)"} 
+{"time":"2025-12-10T04:25:22.68593366Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug-internal.log b/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..82cac581960431a2f84898ea3cc0f7642f996eb9 --- /dev/null +++ b/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-10T03:53:36.644840796Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T03:53:36.814471544Z","level":"INFO","msg":"stream: created new stream","id":"u8db4xs3"} +{"time":"2025-12-10T03:53:36.814562621Z","level":"INFO","msg":"handler: started","stream_id":"u8db4xs3"} +{"time":"2025-12-10T03:53:36.814680984Z","level":"INFO","msg":"stream: started","id":"u8db4xs3"} +{"time":"2025-12-10T03:53:36.814691619Z","level":"INFO","msg":"writer: started","stream_id":"u8db4xs3"} +{"time":"2025-12-10T03:53:36.814703071Z","level":"INFO","msg":"sender: started","stream_id":"u8db4xs3"} diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug.log b/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..cd5b0371dcce89beab71f1b761b790a78e4b34ef --- /dev/null +++ b/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_setup.py:_flush():80] Configure stats pid to 1991718 +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug.log +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_035336-u8db4xs3/logs/debug-internal.log +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_init.py:init():841] calling init triggers +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 03:53:36,386 INFO MainThread:1991718 [wandb_init.py:init():889] starting backend +2025-12-10 03:53:36,638 INFO MainThread:1991718 [wandb_init.py:init():892] sending inform_init request +2025-12-10 03:53:36,643 INFO MainThread:1991718 [wandb_init.py:init():900] backend started and connected +2025-12-10 03:53:36,644 INFO MainThread:1991718 [wandb_init.py:init():970] updated telemetry +2025-12-10 03:53:36,648 INFO MainThread:1991718 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 03:53:37,192 INFO MainThread:1991718 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 03:53:37,316 INFO MainThread:1991718 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 03:53:37,316 INFO MainThread:1991718 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 03:53:37,316 INFO 
MainThread:1991718 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 03:53:37,316 INFO MainThread:1991718 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-10 03:53:37,319 INFO MainThread:1991718 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 03:53:37,319 INFO MainThread:1991718 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_180x320_16f_2bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'features_dir': None} diff --git a/Meissonic/wandb/run-20251210_035336-u8db4xs3/run-u8db4xs3.wandb b/Meissonic/wandb/run-20251210_035336-u8db4xs3/run-u8db4xs3.wandb new file mode 100644 index 0000000000000000000000000000000000000000..9ec9fdb344bf578e41379c0e5beda1d214952351 --- /dev/null +++ b/Meissonic/wandb/run-20251210_035336-u8db4xs3/run-u8db4xs3.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65adf28c4399ff3bde9d3f9930b1b289b78a3873e5979acb6f690fb5a2002b12 +size 491520 diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/files/config.yaml b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71664e191559314ce38d248ef90ce733267b6830 --- /dev/null +++ b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/config.yaml @@ -0,0 +1,303 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + fpeewl20f52v8o7trkfvthix59lgdpw0: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - 
--wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "2" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12139077726208" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-10T04:27:47.773230Z" + writerId: fpeewl20f52v8o7trkfvthix59lgdpw0 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 
+dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 2 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/files/output.log b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..65584761c8dfee33cd1f4bb4610c1407b5523e53 --- /dev/null +++ b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/output.log @@ -0,0 +1,250 @@ +12/10/2025 04:27:48 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 04:27:48 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 04:27:48 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 04:27:48 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 04:27:48 - INFO - __main__ - Getting compressed dimensions from precomputed features... 
+12/10/2025 04:27:48 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 04:27:48 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 04:27:48 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:27:48 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 04:28:04 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:28:04 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:28:06 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 04:28:08 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 04:28:08 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 04:28:08 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 04:28:08 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 04:28:08 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 04:28:08 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 04:28:08 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 04:28:08 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 04:28:08 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 04:28:08 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 04:28:08 - INFO - __main__ - Dataloader configuration: +12/10/2025 04:28:08 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 04:28:08 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 04:28:08 - INFO - __main__ - - persistent_workers: True +12/10/2025 04:28:08 - INFO - __main__ - - pin_memory: True +12/10/2025 04:28:08 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 04:28:10 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 04:28:10 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/10/2025 04:28:10 - INFO - __main__ - ***** Running training ***** +12/10/2025 04:28:10 - INFO - __main__ - Num training steps = 10000 +12/10/2025 04:28:10 - INFO - __main__ - Instantaneous batch size per device = 2 +12/10/2025 04:28:10 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 64 +12/10/2025 04:28:10 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([2, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([2, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/10/2025 04:28:11 - WARNING - __main__ - cond_dropout_prob > 0.0 is not supported with precomputed features. Skipping cond_dropout. +[DEBUG] video_tokens: shape=torch.Size([2, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([2, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/10/2025 04:28:15 - WARNING - __main__ - cond_dropout_prob > 0.0 is not supported with precomputed features. Skipping cond_dropout. 
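The DEBUG lines above show video_tokens of shape [2, 5, 16, 16], matching the metadata dimensions F'=5, H'=16, W'=16 reported for 17-frame, 128x128 clips under Cosmos-0.1-Tokenizer-DV4x8x8 (4x temporal, 8x8 spatial compression). A small worked sketch of that arithmetic follows; the exact temporal rounding is an assumption (a causal scheme that encodes the first frame separately), chosen because it reproduces the logged "actual" value of 5 rather than the "theoretical" 4.

```python
# Hedged arithmetic sketch for the logged token-grid sizes. The causal
# temporal formula below is an assumption about the DV4x8x8 tokenizer, but it
# reproduces the logged F'=5 (vs. the "theoretical" 17 // 4 = 4).
num_frames, height, width = 17, 128, 128
t_stride, s_stride = 4, 8                     # "DV4x8x8": 4x temporal, 8x8 spatial

f_tokens = (num_frames - 1) // t_stride + 1   # (17 - 1) // 4 + 1 = 5
h_tokens = height // s_stride                 # 128 // 8 = 16
w_tokens = width // s_stride                  # 128 // 8 = 16

print(f_tokens, h_tokens, w_tokens)           # 5 16 16 -> video_tokens shape [B, 5, 16, 16]
```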
+12/10/2025 04:28:37 - INFO - __main__ - Step: 10 Loss: 11.0752 LR: 0.000060
+12/10/2025 04:28:58 - INFO - __main__ - Step: 20 Loss: 11.0732 LR: 0.000060
+12/10/2025 04:29:19 - INFO - __main__ - Step: 30 Loss: 11.0719 LR: 0.000060
+12/10/2025 04:29:41 - INFO - __main__ - Step: 40 Loss: 11.0682 LR: 0.000060
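The repeated "cond_dropout_prob > 0.0 is not supported with precomputed features" warning is consistent with the earlier "Skipping empty_embeds creation" message: conditioning dropout (the usual way to train classifier-free guidance) needs a null/empty prompt embedding to swap in, and with precomputed text features the script never builds one, so the dropout is skipped even though --cond_dropout_prob 0.1 was passed. For context, a minimal sketch of how such dropout is commonly applied when embeddings are computed on the fly; this is an illustration under assumed shapes from the [DEBUG] lines, not the script's actual code, and empty_embeds is a hypothetical placeholder:

    import torch

    def apply_cond_dropout(encoder_hidden_states, empty_embeds, cond_dropout_prob):
        # Randomly replace a sample's text embedding with the "empty prompt" embedding,
        # the standard trick for training classifier-free guidance.
        if cond_dropout_prob <= 0.0:
            return encoder_hidden_states
        batch = encoder_hidden_states.shape[0]
        drop = torch.rand(batch, device=encoder_hidden_states.device) < cond_dropout_prob
        out = encoder_hidden_states.clone()
        out[drop] = empty_embeds.to(dtype=out.dtype, device=out.device)
        return out

    # Shapes follow the [DEBUG] lines above: (batch, 512 tokens, 4096 text dims).
    cond = torch.randn(2, 512, 4096)
    null = torch.zeros(512, 4096)  # hypothetical precomputed embedding of the empty prompt
    dropped = apply_cond_dropout(cond, null, cond_dropout_prob=0.1)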
+Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in + if __name__ == "__main__": + File "/mnt/Meissonic/train/train_mei_video.py", line 1509, in main + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1692, in +[rank0]: if __name__ == "__main__": +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1509, in main +[rank0]: +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/files/requirements.txt b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 
+pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d3a9728ca0269de1a707fb32b43b607a6b6db4b0 --- /dev/null +++ b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T04:27:47.773230Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "2", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + 
"--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12139077726208" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "fpeewl20f52v8o7trkfvthix59lgdpw0" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/files/wandb-summary.json b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b7693a059d00dfae50a8041a986a4278b7de3e7c --- /dev/null +++ b/Meissonic/wandb/run-20251210_042747-ig7zestl/files/wandb-summary.json @@ -0,0 +1 @@ +{"_timestamp":1.7653409814908886e+09,"_step":40,"step_loss":11.068217277526855,"_wandb":{"runtime":122},"_runtime":122.718655158,"lr":5.9999999999999995e-05,"avg_masking_rate":0.8838287591934204} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug-core.log b/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..5dd0db9cb88e93a1a09c358e93ad9d46f4fc4aec --- /dev/null +++ b/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-10T04:27:47.841707886Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpfheya3w_/port-2616859.txt","pid":2616859,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} 
+{"time":"2025-12-10T04:27:47.842181416Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2616859} +{"time":"2025-12-10T04:27:47.842192259Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2616859-2617110-619976077/socket","Net":"unix"}} +{"time":"2025-12-10T04:27:48.028473168Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T04:27:48.034580582Z","level":"INFO","msg":"handleInformInit: received","streamId":"ig7zestl","id":"1(@)"} +{"time":"2025-12-10T04:27:48.205561021Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ig7zestl","id":"1(@)"} +{"time":"2025-12-10T04:29:51.145346547Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T04:29:51.14542004Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T04:29:51.145409417Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T04:29:51.1454706Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-10T04:29:51.145521073Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2616859-2617110-619976077/socket","Net":"unix"}} +{"time":"2025-12-10T04:29:51.502096225Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-10T04:29:51.50213062Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-10T04:29:51.502146483Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug-internal.log b/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8266798a6ae9a41786d6e14559537affba0d8814 --- /dev/null +++ b/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-10T04:27:48.034729115Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T04:27:48.205351521Z","level":"INFO","msg":"stream: created new stream","id":"ig7zestl"} +{"time":"2025-12-10T04:27:48.205450319Z","level":"INFO","msg":"handler: started","stream_id":"ig7zestl"} +{"time":"2025-12-10T04:27:48.205554006Z","level":"INFO","msg":"stream: started","id":"ig7zestl"} +{"time":"2025-12-10T04:27:48.205570824Z","level":"INFO","msg":"writer: started","stream_id":"ig7zestl"} +{"time":"2025-12-10T04:27:48.205572875Z","level":"INFO","msg":"sender: started","stream_id":"ig7zestl"} +{"time":"2025-12-10T04:29:51.1454205Z","level":"INFO","msg":"stream: closing","id":"ig7zestl"} +{"time":"2025-12-10T04:29:51.383718617Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-10T04:29:51.499051314Z","level":"INFO","msg":"handler: closed","stream_id":"ig7zestl"} +{"time":"2025-12-10T04:29:51.499164357Z","level":"INFO","msg":"sender: closed","stream_id":"ig7zestl"} +{"time":"2025-12-10T04:29:51.499175978Z","level":"INFO","msg":"stream: closed","id":"ig7zestl"} diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug.log b/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8d0c5d1d762fb0c41201333c2bc5c6f03d5609e4 --- /dev/null +++ b/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 04:27:47,776 INFO 
MainThread:2616859 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_setup.py:_flush():80] Configure stats pid to 2616859 +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug.log +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_042747-ig7zestl/logs/debug-internal.log +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_init.py:init():841] calling init triggers +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 04:27:47,776 INFO MainThread:2616859 [wandb_init.py:init():889] starting backend +2025-12-10 04:27:48,028 INFO MainThread:2616859 [wandb_init.py:init():892] sending inform_init request +2025-12-10 04:27:48,032 INFO MainThread:2616859 [wandb_init.py:init():900] backend started and connected +2025-12-10 04:27:48,034 INFO MainThread:2616859 [wandb_init.py:init():970] updated telemetry +2025-12-10 04:27:48,038 INFO MainThread:2616859 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 04:27:48,426 INFO MainThread:2616859 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 04:27:48,549 INFO MainThread:2616859 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 04:27:48,549 INFO MainThread:2616859 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 04:27:48,549 INFO MainThread:2616859 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 04:27:48,549 INFO MainThread:2616859 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-10 04:27:48,551 INFO MainThread:2616859 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 04:27:48,552 INFO MainThread:2616859 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-10 04:29:51,145 INFO wandb-AsyncioManager-main:2616859 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 04:29:51,145 INFO wandb-AsyncioManager-main:2616859 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
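The config dict logged just above ('learning_rate': 0.0003, 'wan_backbone_lr_ratio': 0.2) is where the two learning rates in the training log come from: the pretrained Wan backbone trains at 0.2 * 3e-4 = 6e-5 while the newly initialized token_embedding and logits_head train at the full 3e-4. A rough sketch of how such a split is typically set up with optimizer parameter groups; the toy modules below stand in for the real model, and plain AdamW stands in for the 8-bit Adam this run actually uses:

    import torch
    from torch import nn

    # Toy stand-ins: a pretrained "backbone" plus freshly initialized token heads.
    model = nn.ModuleDict({
        "backbone": nn.Linear(16, 16),
        "token_embedding": nn.Embedding(64001, 16),  # 64000-entry codebook + mask token
        "logits_head": nn.Linear(16, 64001),
    })

    base_lr = 3e-4           # --learning_rate
    backbone_lr_ratio = 0.2  # --wan_backbone_lr_ratio

    param_groups = [
        {"params": model["backbone"].parameters(), "lr": base_lr * backbone_lr_ratio},
        {"params": list(model["token_embedding"].parameters())
                   + list(model["logits_head"].parameters()), "lr": base_lr},
    ]

    # The run passes --use_8bit_adam (bitsandbytes); torch.optim.AdamW keeps this
    # sketch dependency-free.
    optimizer = torch.optim.AdamW(param_groups, weight_decay=0.01)
    print([g["lr"] for g in optimizer.param_groups])  # ~[6e-05, 0.0003]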
diff --git a/Meissonic/wandb/run-20251210_042747-ig7zestl/run-ig7zestl.wandb b/Meissonic/wandb/run-20251210_042747-ig7zestl/run-ig7zestl.wandb new file mode 100644 index 0000000000000000000000000000000000000000..94934f37db15ad63dcac62be0934a50ec76c344a Binary files /dev/null and b/Meissonic/wandb/run-20251210_042747-ig7zestl/run-ig7zestl.wandb differ diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/files/config.yaml b/Meissonic/wandb/run-20251210_043009-5878wpml/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..612d285a19c5ffbc6316038fb6bab4955ae1ac13 --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/files/config.yaml @@ -0,0 +1,303 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + u29b1oh9n7s9k872fah1kpv4yjmrs1rq: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "8" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12139077951488" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: 
"42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-10T04:30:09.598769Z" + writerId: u29b1oh9n7s9k872fah1kpv4yjmrs1rq + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 8 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/files/media/images/generated_videos_first_frame_500_94b9a8ac13767e53dfe4.png b/Meissonic/wandb/run-20251210_043009-5878wpml/files/media/images/generated_videos_first_frame_500_94b9a8ac13767e53dfe4.png new file mode 100644 index 0000000000000000000000000000000000000000..8bff0fa2febd24fc4933bcf5f68c41d8455e50f0 
Binary files /dev/null and b/Meissonic/wandb/run-20251210_043009-5878wpml/files/media/images/generated_videos_first_frame_500_94b9a8ac13767e53dfe4.png differ diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/files/media/images/generated_videos_first_frame_500_ac0cddc93fd7d890391c.png b/Meissonic/wandb/run-20251210_043009-5878wpml/files/media/images/generated_videos_first_frame_500_ac0cddc93fd7d890391c.png new file mode 100644 index 0000000000000000000000000000000000000000..2b6b9a760874b145e4222018368bef6ff46e215d Binary files /dev/null and b/Meissonic/wandb/run-20251210_043009-5878wpml/files/media/images/generated_videos_first_frame_500_ac0cddc93fd7d890391c.png differ diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/files/output.log b/Meissonic/wandb/run-20251210_043009-5878wpml/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d179946fda1932aeb103a9b8f1a47ea816256f38 --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/files/output.log @@ -0,0 +1,145 @@ +12/10/2025 04:30:10 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 04:30:10 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 04:30:10 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 04:30:10 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 04:30:10 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/10/2025 04:30:10 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 04:30:10 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 04:30:10 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:30:10 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 04:30:26 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:30:26 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:30:28 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 04:30:30 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 04:30:30 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 04:30:30 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 04:30:30 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 04:30:30 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 04:30:30 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 04:30:30 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 04:30:30 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 04:30:30 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 04:30:30 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 04:30:30 - INFO - __main__ - Dataloader configuration: +12/10/2025 04:30:30 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 04:30:30 - INFO - __main__ - - 
prefetch_factor: 2 +12/10/2025 04:30:30 - INFO - __main__ - - persistent_workers: True +12/10/2025 04:30:30 - INFO - __main__ - - pin_memory: True +12/10/2025 04:30:30 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 04:30:31 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 04:30:31 - INFO - __main__ - Skipping empty_embeds creation - using precomputed features +12/10/2025 04:30:31 - INFO - __main__ - ***** Running training ***** +12/10/2025 04:30:31 - INFO - __main__ - Num training steps = 10000 +12/10/2025 04:30:31 - INFO - __main__ - Instantaneous batch size per device = 8 +12/10/2025 04:30:31 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 256 +12/10/2025 04:30:31 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/10/2025 04:30:55 - INFO - __main__ - Step: 10 Loss: 11.0747 LR: 0.000060 +12/10/2025 04:31:12 - INFO - __main__ - Step: 20 Loss: 11.0729 LR: 0.000060 +12/10/2025 04:31:29 - INFO - __main__ - Step: 30 Loss: 11.0707 LR: 0.000060 +12/10/2025 04:31:46 - INFO - __main__ - Step: 40 Loss: 11.0673 LR: 0.000060 +12/10/2025 04:32:04 - INFO - __main__ - Step: 50 Loss: 11.0643 LR: 0.000060 +12/10/2025 04:32:21 - INFO - __main__ - Step: 60 Loss: 11.0605 LR: 0.000060 +12/10/2025 04:32:39 - INFO - __main__ - Step: 70 Loss: 11.0540 LR: 0.000060 +12/10/2025 04:32:56 - INFO - __main__ - Step: 80 Loss: 11.0399 LR: 0.000060 +12/10/2025 04:33:13 - INFO - __main__ - Step: 90 Loss: 11.0209 LR: 0.000060 +12/10/2025 04:33:31 - INFO - __main__ - Step: 100 Loss: 10.9853 LR: 0.000060 +12/10/2025 04:33:48 - INFO - __main__ - Step: 110 Loss: 10.9407 LR: 0.000060 +12/10/2025 04:34:06 - INFO - __main__ - Step: 120 Loss: 10.8765 LR: 0.000060 +12/10/2025 04:34:24 - INFO - __main__ - Step: 130 Loss: 10.8079 LR: 0.000060 +12/10/2025 04:34:41 - INFO - __main__ - Step: 140 Loss: 10.7455 LR: 0.000060 +12/10/2025 04:34:58 - INFO - __main__ - Step: 150 Loss: 10.6594 LR: 0.000060 +12/10/2025 04:35:15 - INFO - __main__ - Step: 160 Loss: 10.6105 LR: 0.000060 +12/10/2025 04:35:33 - INFO - __main__ - Step: 170 Loss: 10.5386 LR: 0.000060 +12/10/2025 04:35:51 - INFO - __main__ - Step: 180 Loss: 10.4894 LR: 0.000060 +12/10/2025 04:36:10 - INFO - __main__ - Step: 190 Loss: 10.4253 LR: 0.000060 +12/10/2025 04:36:30 - INFO - __main__ - Step: 200 Loss: 10.3921 LR: 0.000060 +12/10/2025 04:36:48 - INFO - __main__ - Step: 210 Loss: 10.3110 LR: 0.000060 +12/10/2025 04:37:07 - INFO - __main__ - Step: 220 Loss: 10.2477 LR: 0.000060 +12/10/2025 04:37:26 - INFO - __main__ - Step: 230 Loss: 10.1233 LR: 0.000060 +12/10/2025 04:37:45 - INFO - __main__ - Step: 240 Loss: 9.8485 LR: 0.000060 +12/10/2025 04:38:04 - INFO - __main__ - Step: 250 Loss: 9.7038 LR: 0.000060 +12/10/2025 04:38:23 - INFO - __main__ - Step: 260 Loss: 9.1299 LR: 0.000060 +12/10/2025 04:38:42 - INFO - __main__ - Step: 270 Loss: 9.1336 LR: 0.000060 +12/10/2025 04:39:00 - INFO - __main__ - Step: 280 Loss: 8.3852 LR: 0.000060 +12/10/2025 04:39:18 - INFO - __main__ - Step: 290 Loss: 7.4781 LR: 0.000060 +12/10/2025 04:39:35 - INFO - __main__ - Step: 300 Loss: 7.1441 LR: 0.000060 +12/10/2025 
04:39:52 - INFO - __main__ - Step: 310 Loss: 7.0994 LR: 0.000060 +12/10/2025 04:40:09 - INFO - __main__ - Step: 320 Loss: 6.8467 LR: 0.000060 +12/10/2025 04:40:28 - INFO - __main__ - Step: 330 Loss: 4.6846 LR: 0.000060 +12/10/2025 04:40:46 - INFO - __main__ - Step: 340 Loss: 6.1597 LR: 0.000060 +12/10/2025 04:41:04 - INFO - __main__ - Step: 350 Loss: 4.1510 LR: 0.000060 +12/10/2025 04:41:22 - INFO - __main__ - Step: 360 Loss: 3.1741 LR: 0.000060 +12/10/2025 04:41:39 - INFO - __main__ - Step: 370 Loss: 2.4839 LR: 0.000060 +12/10/2025 04:41:57 - INFO - __main__ - Step: 380 Loss: 1.9491 LR: 0.000060 +12/10/2025 04:42:14 - INFO - __main__ - Step: 390 Loss: 1.6005 LR: 0.000060 +12/10/2025 04:42:32 - INFO - __main__ - Step: 400 Loss: 1.2793 LR: 0.000060 +12/10/2025 04:42:49 - INFO - __main__ - Step: 410 Loss: 1.0339 LR: 0.000060 +12/10/2025 04:43:06 - INFO - __main__ - Step: 420 Loss: 0.8393 LR: 0.000060 +12/10/2025 04:43:24 - INFO - __main__ - Step: 430 Loss: 0.7320 LR: 0.000060 +12/10/2025 04:43:41 - INFO - __main__ - Step: 440 Loss: 0.6767 LR: 0.000060 +12/10/2025 04:43:58 - INFO - __main__ - Step: 450 Loss: 2.8503 LR: 0.000060 +12/10/2025 04:44:16 - INFO - __main__ - Step: 460 Loss: 10.3635 LR: 0.000060 +12/10/2025 04:44:34 - INFO - __main__ - Step: 470 Loss: 9.7815 LR: 0.000060 +12/10/2025 04:44:53 - INFO - __main__ - Step: 480 Loss: 8.7190 LR: 0.000060 +12/10/2025 04:45:12 - INFO - __main__ - Step: 490 Loss: 6.0148 LR: 0.000060 +12/10/2025 04:45:31 - INFO - __main__ - Step: 500 Loss: 2.5093 LR: 0.000060 +12/10/2025 04:45:31 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 04:45:38 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/optimizer.bin +12/10/2025 04:45:38 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/scheduler.bin +12/10/2025 04:45:38 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/sampler.bin +12/10/2025 04:45:38 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl +12/10/2025 04:45:38 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 04:45:38 - INFO - __main__ - Generating videos for validation... +12/10/2025 04:45:38 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 75.39it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7138.37it/s] +12/10/2025 04:45:55 - INFO - __main__ - Text encoder and video tokenizer loaded for validation +12/10/2025 04:45:55 - INFO - __main__ - Generating videos for validation... 
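A sanity check on the loss curve above: the initial plateau around 11.07 is what a uniform prediction over the 64,000-entry codebook would score, since cross-entropy under a uniform distribution is ln(codebook_size) ~= 11.07; the steady fall to ~0.68 by step 440 (and the jump back above 10 around step 460) can be read against that baseline. A one-liner to confirm the number:

    import math

    codebook_size = 64000  # "Loaded from metadata: codebook_size=64000"
    print(math.log(codebook_size))  # ~11.067, matching the ~11.07 starting loss in this log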
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.24it/s] +12/10/2025 04:46:02 - INFO - __main__ - Validation videos saved to ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1693, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1510, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.44 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1015.56 MiB is free. Including non-PyTorch memory, this process has 38.49 GiB memory in use. Of the allocated memory 28.91 GiB is allocated by PyTorch, and 8.47 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1693, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1510, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.44 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1015.56 MiB is free. Including non-PyTorch memory, this process has 38.49 GiB memory in use. Of the allocated memory 28.91 GiB is allocated by PyTorch, and 8.47 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Exception ignored in atexit callback: +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1648, in _clean_up_worker + w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/process.py", line 149, in join + res = self._popen.wait(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait + if not wait([self.sentinel], timeout): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt: diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/files/requirements.txt b/Meissonic/wandb/run-20251210_043009-5878wpml/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 
+tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_043009-5878wpml/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d7a3ba58586d478faa4c009fd7f1e00d2d2a46ae --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T04:30:09.598769Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12139077951488" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA 
A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "u29b1oh9n7s9k872fah1kpv4yjmrs1rq" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/files/wandb-summary.json b/Meissonic/wandb/run-20251210_043009-5878wpml/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..569aef916d0f809203a931f426ea046f75f32249 --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/files/wandb-summary.json @@ -0,0 +1 @@ +{"avg_masking_rate":0.6932775974273682,"_timestamp":1.7653419622041483e+09,"_step":500,"step_loss":2.509253978729248,"generated_videos_first_frame":{"count":2,"filenames":["media/images/generated_videos_first_frame_500_ac0cddc93fd7d890391c.png","media/images/generated_videos_first_frame_500_94b9a8ac13767e53dfe4.png"],"captions":["a cat playing","a girl walking"],"_type":"images/separated","width":128,"height":128,"format":"png"},"_wandb":{"runtime":974},"_runtime":974.663843316,"lr":5.9999999999999995e-05} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug-core.log b/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..d91454d8cfdfd3d67b00bb7c4c2502c6acf075e9 --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-10T04:30:09.698193881Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpxqlj0nzs/port-2620362.txt","pid":2620362,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T04:30:09.698667007Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2620362} +{"time":"2025-12-10T04:30:09.698677597Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2620362-2620607-330668019/socket","Net":"unix"}} +{"time":"2025-12-10T04:30:09.884629696Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T04:30:09.891548314Z","level":"INFO","msg":"handleInformInit: received","streamId":"5878wpml","id":"1(@)"} +{"time":"2025-12-10T04:30:10.061099346Z","level":"INFO","msg":"handleInformInit: stream 
started","streamId":"5878wpml","id":"1(@)"} +{"time":"2025-12-10T04:46:24.942928232Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T04:46:24.943002572Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T04:46:24.942991389Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T04:46:24.943075829Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-10T04:46:24.943091418Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2620362-2620607-330668019/socket","Net":"unix"}} +{"time":"2025-12-10T04:46:25.484502998Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-10T04:46:25.484570442Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-10T04:46:25.484589679Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug-internal.log b/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ceadcd586173e2d4f22d279c6af2444a8093e599 --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-10T04:30:09.891653528Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T04:30:10.060886723Z","level":"INFO","msg":"stream: created new stream","id":"5878wpml"} +{"time":"2025-12-10T04:30:10.060977449Z","level":"INFO","msg":"handler: started","stream_id":"5878wpml"} +{"time":"2025-12-10T04:30:10.061092102Z","level":"INFO","msg":"stream: started","id":"5878wpml"} +{"time":"2025-12-10T04:30:10.061107932Z","level":"INFO","msg":"writer: started","stream_id":"5878wpml"} +{"time":"2025-12-10T04:30:10.061111572Z","level":"INFO","msg":"sender: started","stream_id":"5878wpml"} +{"time":"2025-12-10T04:46:24.942997072Z","level":"INFO","msg":"stream: closing","id":"5878wpml"} +{"time":"2025-12-10T04:46:25.196277775Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-10T04:46:25.39496968Z","level":"INFO","msg":"handler: closed","stream_id":"5878wpml"} +{"time":"2025-12-10T04:46:25.395193205Z","level":"INFO","msg":"sender: closed","stream_id":"5878wpml"} +{"time":"2025-12-10T04:46:25.39520522Z","level":"INFO","msg":"stream: closed","id":"5878wpml"} diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug.log b/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..fa09297566893f55a42fe8cd6739d55d124252fb --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 04:30:09,603 INFO MainThread:2620362 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_setup.py:_flush():80] Configure stats pid to 2620362 +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_init.py:setup_run_log_directory():714] 
Logging user logs to /mnt/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug.log +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_043009-5878wpml/logs/debug-internal.log +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_init.py:init():841] calling init triggers +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 04:30:09,604 INFO MainThread:2620362 [wandb_init.py:init():889] starting backend +2025-12-10 04:30:09,884 INFO MainThread:2620362 [wandb_init.py:init():892] sending inform_init request +2025-12-10 04:30:09,890 INFO MainThread:2620362 [wandb_init.py:init():900] backend started and connected +2025-12-10 04:30:09,891 INFO MainThread:2620362 [wandb_init.py:init():970] updated telemetry +2025-12-10 04:30:09,897 INFO MainThread:2620362 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 04:30:10,278 INFO MainThread:2620362 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 04:30:10,458 INFO MainThread:2620362 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 04:30:10,458 INFO MainThread:2620362 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 04:30:10,458 INFO MainThread:2620362 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 04:30:10,458 INFO MainThread:2620362 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-10 04:30:10,461 INFO MainThread:2620362 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 04:30:10,462 INFO MainThread:2620362 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 
'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features'} +2025-12-10 04:46:24,943 INFO wandb-AsyncioManager-main:2620362 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 04:46:24,943 INFO wandb-AsyncioManager-main:2620362 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251210_043009-5878wpml/run-5878wpml.wandb b/Meissonic/wandb/run-20251210_043009-5878wpml/run-5878wpml.wandb new file mode 100644 index 0000000000000000000000000000000000000000..b770ff4ed8dc5dc4e99d3e2d23329f14ef28136e --- /dev/null +++ b/Meissonic/wandb/run-20251210_043009-5878wpml/run-5878wpml.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32516d6df52a8fb2f9ec2a69ea11b7f80157f8295a2db93de1a7370464134d0 +size 296403 diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/files/config.yaml b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..166a63f31ea943fcb6f7fd9f38085093b6c82fb8 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/config.yaml @@ -0,0 +1,303 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + b3z1vhpf74rln28qyknvlicz64uo75mz: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "8" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12144812875776" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA 
A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-10T04:55:07.059107Z" + writerId: b3z1vhpf74rln28qyknvlicz64uo75mz + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +empty_embeds_path: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 8 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + 
- a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/files/output.log b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..58ad0ed8c33de6dc8ce4533fd8c2a118bbf481d7 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/output.log @@ -0,0 +1,18 @@ +12/10/2025 04:55:07 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 04:55:07 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 04:55:07 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 04:55:07 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 04:55:07 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/10/2025 04:55:07 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1744, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 647, in main + if sample_path: +NameError: name 'sample_path' is not defined +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1744, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 647, in main +[rank0]: if sample_path: +[rank0]: NameError: name 'sample_path' is not defined diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/files/requirements.txt b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 
+hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6e98bcdff8d61543ae37132ef5cb4ba8566b1649 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T04:55:07.059107Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": 
"/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12144812875776" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "b3z1vhpf74rln28qyknvlicz64uo75mz" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/files/wandb-summary.json b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a620d0c1047a4dd8a400939b6da246ed8063a7 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045507-n9avnzup/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0},"_runtime":0} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug-core.log b/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..b8c2057f1fe19f31d351f20925c276726d7dc369 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-10T04:55:07.132956152Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpdygqy9pg/port-2642308.txt","pid":2642308,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T04:55:07.133486865Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2642308} +{"time":"2025-12-10T04:55:07.133496718Z","level":"INFO","msg":"server: accepting 
connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2642308-2642553-4210231117/socket","Net":"unix"}} +{"time":"2025-12-10T04:55:07.318948519Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T04:55:07.325090184Z","level":"INFO","msg":"handleInformInit: received","streamId":"n9avnzup","id":"1(@)"} +{"time":"2025-12-10T04:55:07.494164812Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"n9avnzup","id":"1(@)"} +{"time":"2025-12-10T04:55:07.811427792Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T04:55:07.811469711Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T04:55:07.811463462Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T04:55:07.811505274Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-10T04:55:07.811552197Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2642308-2642553-4210231117/socket","Net":"unix"}} +{"time":"2025-12-10T04:55:08.341428152Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-10T04:55:08.341448567Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-10T04:55:08.341461221Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug-internal.log b/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..176a83f298b6a2c8cc5f623e8ec8184dfdeca4fc --- /dev/null +++ b/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-10T04:55:07.325196653Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T04:55:07.493938833Z","level":"INFO","msg":"stream: created new stream","id":"n9avnzup"} +{"time":"2025-12-10T04:55:07.494040848Z","level":"INFO","msg":"handler: started","stream_id":"n9avnzup"} +{"time":"2025-12-10T04:55:07.494157208Z","level":"INFO","msg":"stream: started","id":"n9avnzup"} +{"time":"2025-12-10T04:55:07.494184586Z","level":"INFO","msg":"writer: started","stream_id":"n9avnzup"} +{"time":"2025-12-10T04:55:07.49418817Z","level":"INFO","msg":"sender: started","stream_id":"n9avnzup"} +{"time":"2025-12-10T04:55:07.811473161Z","level":"INFO","msg":"stream: closing","id":"n9avnzup"} +{"time":"2025-12-10T04:55:08.201048352Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-10T04:55:08.336943071Z","level":"INFO","msg":"handler: closed","stream_id":"n9avnzup"} +{"time":"2025-12-10T04:55:08.337078003Z","level":"INFO","msg":"sender: closed","stream_id":"n9avnzup"} +{"time":"2025-12-10T04:55:08.337085632Z","level":"INFO","msg":"stream: closed","id":"n9avnzup"} diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug.log b/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2406c5fc2017cc1ef8f1b730eeef95ee67b727f2 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 04:55:07,062 INFO MainThread:2642308 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 04:55:07,062 INFO MainThread:2642308 [wandb_setup.py:_flush():80] Configure stats pid to 2642308 +2025-12-10 04:55:07,062 
INFO MainThread:2642308 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 04:55:07,062 INFO MainThread:2642308 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 04:55:07,062 INFO MainThread:2642308 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 04:55:07,062 INFO MainThread:2642308 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug.log +2025-12-10 04:55:07,062 INFO MainThread:2642308 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_045507-n9avnzup/logs/debug-internal.log +2025-12-10 04:55:07,062 INFO MainThread:2642308 [wandb_init.py:init():841] calling init triggers +2025-12-10 04:55:07,062 INFO MainThread:2642308 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 04:55:07,063 INFO MainThread:2642308 [wandb_init.py:init():889] starting backend +2025-12-10 04:55:07,319 INFO MainThread:2642308 [wandb_init.py:init():892] sending inform_init request +2025-12-10 04:55:07,323 INFO MainThread:2642308 [wandb_init.py:init():900] backend started and connected +2025-12-10 04:55:07,324 INFO MainThread:2642308 [wandb_init.py:init():970] updated telemetry +2025-12-10 04:55:07,329 INFO MainThread:2642308 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 04:55:07,672 INFO MainThread:2642308 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 04:55:07,798 INFO MainThread:2642308 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 04:55:07,798 INFO MainThread:2642308 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 04:55:07,798 INFO MainThread:2642308 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 04:55:07,798 INFO MainThread:2642308 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-10 04:55:07,802 INFO MainThread:2642308 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 04:55:07,803 INFO MainThread:2642308 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} +2025-12-10 04:55:07,811 INFO wandb-AsyncioManager-main:2642308 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 04:55:07,811 INFO wandb-AsyncioManager-main:2642308 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
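The run above (run-20251210_045507-n9avnzup) exits immediately with NameError: name 'sample_path' is not defined at train_mei_video.py line 647 ("if sample_path:"). The likely cause, judging from the surrounding log lines, is that sample_path is only assigned on the branch that loads raw videos, so the name is unbound when --use_precomputed_features skips that branch. The sketch below reproduces that bug pattern and one possible guard; the variable names and the print calls are illustrative assumptions, not the actual flow of train_mei_video.py.

# Minimal, self-contained reproduction of the bug pattern and a guarded fix (hypothetical names).
use_precomputed_features = True  # mirrors --use_precomputed_features in the failing run

# Buggy shape: sample_path is only bound inside one branch.
#   if not use_precomputed_features:
#       sample_path = "/path/to/raw/sample"
#   if sample_path:          # NameError when the branch above was skipped
#       ...

# Guarded shape: give the name a default before it is tested.
sample_path = None
if not use_precomputed_features:
    sample_path = "/path/to/raw/sample"  # assumption: only the raw-video path needs this

if sample_path:
    print(f"loading raw sample from {sample_path}")
else:
    print("precomputed features in use; skipping raw sample loading")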
diff --git a/Meissonic/wandb/run-20251210_045507-n9avnzup/run-n9avnzup.wandb b/Meissonic/wandb/run-20251210_045507-n9avnzup/run-n9avnzup.wandb new file mode 100644 index 0000000000000000000000000000000000000000..54c7be523a3533afb0080b821ddedce5e308b112 Binary files /dev/null and b/Meissonic/wandb/run-20251210_045507-n9avnzup/run-n9avnzup.wandb differ diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/config.yaml b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88fe3fde022d55be4147605405e56fdf06c5bd90 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/config.yaml @@ -0,0 +1,303 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + qqpknxczyu7ian1xhqsu8gzi7ixhu7s2: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "8" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12144813121536" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: 
"42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-10T04:57:17.679940Z" + writerId: qqpknxczyu7ian1xhqsu8gzi7ixhu7s2 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +empty_embeds_path: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 8 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/output.log b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e3a909c10891a42c28df0b838ad6668812fb4b42 --- /dev/null +++ 
b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/output.log @@ -0,0 +1,68 @@ +12/10/2025 04:57:18 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 04:57:18 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 04:57:18 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 04:57:18 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 04:57:18 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/10/2025 04:57:18 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 04:57:18 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 04:57:18 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:57:18 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 04:57:35 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:57:35 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:57:37 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 04:57:39 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 04:57:39 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 04:57:39 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 04:57:39 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 04:57:39 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 04:57:39 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 04:57:39 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 04:57:39 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 04:57:39 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 04:57:39 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 04:57:39 - INFO - __main__ - Dataloader configuration: +12/10/2025 04:57:39 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 04:57:39 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 04:57:39 - INFO - __main__ - - persistent_workers: True +12/10/2025 04:57:39 - INFO - __main__ - - pin_memory: True +12/10/2025 04:57:39 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 04:57:41 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 04:57:41 - INFO - __main__ - Loading empty_embeds from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +12/10/2025 04:57:41 - INFO - __main__ - Empty embeds info from metadata: shape=[1, 512, 4096] +12/10/2025 04:57:41 - INFO - __main__ - Loaded empty_embeds: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16 +12/10/2025 04:57:41 - INFO - __main__ - ***** Running training ***** +12/10/2025 04:57:41 - INFO - __main__ - Num training steps = 10000 +12/10/2025 04:57:41 - INFO - __main__ - Instantaneous batch size per device = 8 +12/10/2025 04:57:41 - INFO - __main__ - Total train batch size 
(w. parallel, distributed & accumulation) = 256 +12/10/2025 04:57:41 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1744, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1550, in main + loss = F.cross_entropy( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy + return torch._C._nn.cross_entropy_loss( +NotImplementedError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int' +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1744, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1550, in main +[rank0]: loss = F.cross_entropy( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/functional.py", line 3458, in cross_entropy +[rank0]: return torch._C._nn.cross_entropy_loss( +[rank0]: NotImplementedError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int' +Exception ignored in atexit callback: +Traceback (most recent call last): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1648, in _clean_up_worker + w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/process.py", line 149, in join + res = self._popen.wait(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait + if not wait([self.sentinel], timeout): + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt: diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/requirements.txt b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 
+aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..85eded996083791a44bc231bcb0ff2b87aae26a4 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T04:57:17.679940Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + 
"./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12144813121536" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "qqpknxczyu7ian1xhqsu8gzi7ixhu7s2" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/wandb-summary.json b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..519453af2693820c461935aa95c333853a63586f --- /dev/null +++ b/Meissonic/wandb/run-20251210_045717-nkp4mvju/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":37,"_wandb":{"runtime":37}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug-core.log b/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2031670e93e4b52608c7fe1b8c130fb8fb0d78c3 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-10T04:57:17.749501651Z","level":"INFO","msg":"main: starting 
server","port-filename":"/opt/dlami/nvme/tmp_user/tmp9zt3zwee/port-2645064.txt","pid":2645064,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T04:57:17.750776461Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2645064} +{"time":"2025-12-10T04:57:17.750766186Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2645064-2645236-1292604464/socket","Net":"unix"}} +{"time":"2025-12-10T04:57:17.935921782Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T04:57:17.942016424Z","level":"INFO","msg":"handleInformInit: received","streamId":"nkp4mvju","id":"1(@)"} +{"time":"2025-12-10T04:57:18.108561105Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"nkp4mvju","id":"1(@)"} +{"time":"2025-12-10T04:57:56.198239991Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T04:57:56.198455801Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T04:57:56.198521874Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-10T04:57:56.198472608Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T04:57:56.198634459Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2645064-2645236-1292604464/socket","Net":"unix"}} +{"time":"2025-12-10T04:57:56.620920763Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-10T04:57:56.620942501Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-10T04:57:56.620956696Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug-internal.log b/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b3e6e62e93078e68590e096e5f85b3bca407a9f3 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-10T04:57:17.942109813Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T04:57:18.108374946Z","level":"INFO","msg":"stream: created new stream","id":"nkp4mvju"} +{"time":"2025-12-10T04:57:18.108460354Z","level":"INFO","msg":"handler: started","stream_id":"nkp4mvju"} +{"time":"2025-12-10T04:57:18.108554304Z","level":"INFO","msg":"stream: started","id":"nkp4mvju"} +{"time":"2025-12-10T04:57:18.108573013Z","level":"INFO","msg":"writer: started","stream_id":"nkp4mvju"} +{"time":"2025-12-10T04:57:18.108573172Z","level":"INFO","msg":"sender: started","stream_id":"nkp4mvju"} +{"time":"2025-12-10T04:57:56.198455836Z","level":"INFO","msg":"stream: closing","id":"nkp4mvju"} +{"time":"2025-12-10T04:57:56.506764749Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-10T04:57:56.617832646Z","level":"INFO","msg":"handler: closed","stream_id":"nkp4mvju"} +{"time":"2025-12-10T04:57:56.617951022Z","level":"INFO","msg":"sender: closed","stream_id":"nkp4mvju"} +{"time":"2025-12-10T04:57:56.617962222Z","level":"INFO","msg":"stream: closed","id":"nkp4mvju"} diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug.log b/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug.log new file mode 100644 index 
0000000000000000000000000000000000000000..29d0e0b67b791f29c353d9dcdba7ec7e8531ca5c --- /dev/null +++ b/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 04:57:17,682 INFO MainThread:2645064 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 04:57:17,682 INFO MainThread:2645064 [wandb_setup.py:_flush():80] Configure stats pid to 2645064 +2025-12-10 04:57:17,682 INFO MainThread:2645064 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 04:57:17,682 INFO MainThread:2645064 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 04:57:17,682 INFO MainThread:2645064 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 04:57:17,682 INFO MainThread:2645064 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug.log +2025-12-10 04:57:17,682 INFO MainThread:2645064 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_045717-nkp4mvju/logs/debug-internal.log +2025-12-10 04:57:17,682 INFO MainThread:2645064 [wandb_init.py:init():841] calling init triggers +2025-12-10 04:57:17,683 INFO MainThread:2645064 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 04:57:17,683 INFO MainThread:2645064 [wandb_init.py:init():889] starting backend +2025-12-10 04:57:17,936 INFO MainThread:2645064 [wandb_init.py:init():892] sending inform_init request +2025-12-10 04:57:17,940 INFO MainThread:2645064 [wandb_init.py:init():900] backend started and connected +2025-12-10 04:57:17,941 INFO MainThread:2645064 [wandb_init.py:init():970] updated telemetry +2025-12-10 04:57:17,946 INFO MainThread:2645064 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 04:57:18,321 INFO MainThread:2645064 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 04:57:18,447 INFO MainThread:2645064 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 04:57:18,447 INFO MainThread:2645064 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 04:57:18,447 INFO MainThread:2645064 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 04:57:18,447 INFO MainThread:2645064 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-10 04:57:18,450 INFO MainThread:2645064 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 04:57:18,451 INFO MainThread:2645064 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} +2025-12-10 04:57:56,198 INFO wandb-AsyncioManager-main:2645064 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 04:57:56,198 INFO wandb-AsyncioManager-main:2645064 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
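[Editor's note on run nkp4mvju above] The traceback shows F.cross_entropy raising NotImplementedError ("nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int'), while the debug prints report video_tokens with dtype=torch.int32. PyTorch's cross_entropy requires class-index targets to be int64 (torch.long), so int32 token ids coming from the precomputed-feature path trigger exactly this error. Below is a minimal sketch of the kind of cast that avoids it; the names logits, video_tokens and loss_mask are assumptions taken from the debug output, not the actual code in train_mei_video.py.

    import torch
    import torch.nn.functional as F

    def masked_token_loss(logits, video_tokens, loss_mask):
        # logits: [B, F', H', W', vocab] floats; video_tokens: [B, F', H', W'] integer token ids.
        # cross_entropy expects int64 class indices, so cast the int32 targets first.
        targets = video_tokens.long()                       # int32 -> int64
        logits_flat = logits.reshape(-1, logits.size(-1))   # [N, vocab]
        targets_flat = targets.reshape(-1)                  # [N]
        loss = F.cross_entropy(logits_flat, targets_flat, reduction="none")
        # average only over the masked positions, as in masked-token training
        mask_flat = loss_mask.reshape(-1).float()
        return (loss * mask_flat).sum() / mask_flat.sum().clamp(min=1)
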
diff --git a/Meissonic/wandb/run-20251210_045717-nkp4mvju/run-nkp4mvju.wandb b/Meissonic/wandb/run-20251210_045717-nkp4mvju/run-nkp4mvju.wandb new file mode 100644 index 0000000000000000000000000000000000000000..403090bd8caf4f543ac1e04115c879ceeb255ae2 Binary files /dev/null and b/Meissonic/wandb/run-20251210_045717-nkp4mvju/run-nkp4mvju.wandb differ diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/config.yaml b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..89db082b4599d94656c2e28e01b5359cd4549c56 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/config.yaml @@ -0,0 +1,305 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + jxbyukuh3yklib8f49yg9pa95z492jqn: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "8" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12144813256704" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: 
"42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-10T04:59:34.205846Z" + writerId: jxbyukuh3yklib8f49yg9pa95z492jqn + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +empty_embeds_path: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 8 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/media/images/generated_videos_first_frame_500_bb0a0ba4ea3567f7f45a.png b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/media/images/generated_videos_first_frame_500_bb0a0ba4ea3567f7f45a.png new file mode 
100644 index 0000000000000000000000000000000000000000..9182bc4387a0fb657a3ea4b2340bd014a1ecdf89 Binary files /dev/null and b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/media/images/generated_videos_first_frame_500_bb0a0ba4ea3567f7f45a.png differ diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/media/images/generated_videos_first_frame_500_e5bd64a3ce99f0946981.png b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/media/images/generated_videos_first_frame_500_e5bd64a3ce99f0946981.png new file mode 100644 index 0000000000000000000000000000000000000000..01e3df6f09803f3af7ddfcc4acfb9f12267803a8 Binary files /dev/null and b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/media/images/generated_videos_first_frame_500_e5bd64a3ce99f0946981.png differ diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/output.log b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..07d8c65500b96b639ec94154b3a5b4bafba25998 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/output.log @@ -0,0 +1,134 @@ +12/10/2025 04:59:34 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 04:59:34 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 04:59:34 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 04:59:34 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 04:59:34 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/10/2025 04:59:34 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 04:59:34 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 04:59:34 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:59:35 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 04:59:50 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:59:50 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 04:59:52 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 04:59:54 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 04:59:54 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 04:59:54 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 04:59:54 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 04:59:54 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 04:59:54 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 04:59:54 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 04:59:54 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 04:59:54 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 04:59:54 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 04:59:54 - INFO - __main__ - Dataloader configuration: +12/10/2025 04:59:54 - INFO - __main__ - - num_workers: 8 (0 
= single-threaded, recommended: 4-8 for video) +12/10/2025 04:59:54 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 04:59:54 - INFO - __main__ - - persistent_workers: True +12/10/2025 04:59:54 - INFO - __main__ - - pin_memory: True +12/10/2025 04:59:54 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 04:59:56 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 04:59:56 - INFO - __main__ - Loading empty_embeds from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +12/10/2025 04:59:56 - INFO - __main__ - Empty embeds info from metadata: shape=[1, 512, 4096] +12/10/2025 04:59:56 - INFO - __main__ - Loaded empty_embeds: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16 +12/10/2025 04:59:56 - INFO - __main__ - ***** Running training ***** +12/10/2025 04:59:56 - INFO - __main__ - Num training steps = 10000 +12/10/2025 04:59:56 - INFO - __main__ - Instantaneous batch size per device = 8 +12/10/2025 04:59:56 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 256 +12/10/2025 04:59:56 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/10/2025 05:00:18 - INFO - __main__ - Step: 10 Loss: 11.0753 LR: 0.000060 +12/10/2025 05:00:34 - INFO - __main__ - Step: 20 Loss: 11.0730 LR: 0.000060 +12/10/2025 05:00:51 - INFO - __main__ - Step: 30 Loss: 11.0694 LR: 0.000060 +12/10/2025 05:01:09 - INFO - __main__ - Step: 40 Loss: 11.0665 LR: 0.000060 +12/10/2025 05:01:26 - INFO - __main__ - Step: 50 Loss: 11.0638 LR: 0.000060 +12/10/2025 05:01:43 - INFO - __main__ - Step: 60 Loss: 11.0581 LR: 0.000060 +12/10/2025 05:01:59 - INFO - __main__ - Step: 70 Loss: 11.0504 LR: 0.000060 +12/10/2025 05:02:16 - INFO - __main__ - Step: 80 Loss: 11.0334 LR: 0.000060 +12/10/2025 05:02:32 - INFO - __main__ - Step: 90 Loss: 11.0099 LR: 0.000060 +12/10/2025 05:02:51 - INFO - __main__ - Step: 100 Loss: 10.9695 LR: 0.000060 +12/10/2025 05:03:09 - INFO - __main__ - Step: 110 Loss: 10.9227 LR: 0.000060 +12/10/2025 05:03:27 - INFO - __main__ - Step: 120 Loss: 10.8649 LR: 0.000060 +12/10/2025 05:03:45 - INFO - __main__ - Step: 130 Loss: 10.7969 LR: 0.000060 +12/10/2025 05:04:01 - INFO - __main__ - Step: 140 Loss: 10.7392 LR: 0.000060 +12/10/2025 05:04:18 - INFO - __main__ - Step: 150 Loss: 10.6720 LR: 0.000060 +12/10/2025 05:04:35 - INFO - __main__ - Step: 160 Loss: 10.6267 LR: 0.000060 +12/10/2025 05:04:51 - INFO - __main__ - Step: 170 Loss: 10.5702 LR: 0.000060 +12/10/2025 05:05:08 - INFO - __main__ - Step: 180 Loss: 10.5286 LR: 0.000060 +12/10/2025 05:05:25 - INFO - __main__ - Step: 190 Loss: 10.4715 LR: 0.000060 +12/10/2025 05:05:42 - INFO - __main__ - Step: 200 Loss: 10.4428 LR: 0.000060 +12/10/2025 05:05:58 - INFO - __main__ - Step: 210 Loss: 10.4134 LR: 0.000060 +12/10/2025 05:06:14 - INFO - __main__ - Step: 220 Loss: 10.3862 LR: 0.000060 +12/10/2025 05:06:31 - INFO - __main__ - Step: 230 Loss: 10.3619 LR: 0.000060 +12/10/2025 05:06:47 - INFO - __main__ - Step: 240 Loss: 10.3476 LR: 0.000060 +12/10/2025 05:07:03 - INFO - __main__ - Step: 250 Loss: 10.3287 LR: 0.000060 +12/10/2025 05:07:20 - INFO - __main__ - Step: 260 Loss: 
10.3108 LR: 0.000060 +12/10/2025 05:07:37 - INFO - __main__ - Step: 270 Loss: 10.3068 LR: 0.000060 +12/10/2025 05:07:53 - INFO - __main__ - Step: 280 Loss: 10.3050 LR: 0.000060 +12/10/2025 05:08:09 - INFO - __main__ - Step: 290 Loss: 10.2792 LR: 0.000060 +12/10/2025 05:08:26 - INFO - __main__ - Step: 300 Loss: 10.2944 LR: 0.000060 +12/10/2025 05:08:43 - INFO - __main__ - Step: 310 Loss: 10.2807 LR: 0.000060 +12/10/2025 05:08:59 - INFO - __main__ - Step: 320 Loss: 10.2740 LR: 0.000060 +12/10/2025 05:09:16 - INFO - __main__ - Step: 330 Loss: 10.2760 LR: 0.000060 +12/10/2025 05:09:32 - INFO - __main__ - Step: 340 Loss: 10.2531 LR: 0.000060 +12/10/2025 05:09:48 - INFO - __main__ - Step: 350 Loss: 10.2740 LR: 0.000060 +12/10/2025 05:10:05 - INFO - __main__ - Step: 360 Loss: 10.2565 LR: 0.000060 +12/10/2025 05:10:22 - INFO - __main__ - Step: 370 Loss: 10.2202 LR: 0.000060 +12/10/2025 05:10:39 - INFO - __main__ - Step: 380 Loss: 10.2410 LR: 0.000060 +12/10/2025 05:10:55 - INFO - __main__ - Step: 390 Loss: 10.2119 LR: 0.000060 +12/10/2025 05:11:12 - INFO - __main__ - Step: 400 Loss: 10.2321 LR: 0.000060 +12/10/2025 05:11:29 - INFO - __main__ - Step: 410 Loss: 10.2193 LR: 0.000060 +12/10/2025 05:11:46 - INFO - __main__ - Step: 420 Loss: 10.2282 LR: 0.000060 +12/10/2025 05:12:02 - INFO - __main__ - Step: 430 Loss: 10.2097 LR: 0.000060 +12/10/2025 05:12:20 - INFO - __main__ - Step: 440 Loss: 10.2135 LR: 0.000060 +12/10/2025 05:12:36 - INFO - __main__ - Step: 450 Loss: 10.2036 LR: 0.000060 +12/10/2025 05:12:53 - INFO - __main__ - Step: 460 Loss: 10.1900 LR: 0.000060 +12/10/2025 05:13:10 - INFO - __main__ - Step: 470 Loss: 10.1852 LR: 0.000060 +12/10/2025 05:13:27 - INFO - __main__ - Step: 480 Loss: 10.1773 LR: 0.000060 +12/10/2025 05:13:44 - INFO - __main__ - Step: 490 Loss: 10.1748 LR: 0.000060 +12/10/2025 05:14:00 - INFO - __main__ - Step: 500 Loss: 10.1598 LR: 0.000060 +12/10/2025 05:14:00 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 05:14:49 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/optimizer.bin +12/10/2025 05:14:50 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/scheduler.bin +12/10/2025 05:14:50 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/sampler.bin +12/10/2025 05:14:50 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl +12/10/2025 05:14:50 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 05:14:50 - INFO - __main__ - Generating videos for validation... +12/10/2025 05:14:50 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 73.06it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 5638.59it/s] +12/10/2025 05:15:05 - INFO - __main__ - Text encoder and video tokenizer loaded for validation +12/10/2025 05:15:05 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.31it/s] +12/10/2025 05:15:12 - INFO - __main__ - Validation videos saved to ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1748, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1565, in main + accelerator.backward(loss) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward + loss.backward(**kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward + torch.autograd.backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward + _engine_run_backward( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.44 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.19 GiB is free. Including non-PyTorch memory, this process has 38.29 GiB memory in use. Of the allocated memory 28.93 GiB is allocated by PyTorch, and 8.25 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1748, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1565, in main +[rank0]: accelerator.backward(loss) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2852, in backward +[rank0]: loss.backward(**kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_tensor.py", line 625, in backward +[rank0]: torch.autograd.backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward +[rank0]: _engine_run_backward( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/autograd/graph.py", line 841, in _engine_run_backward +[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.44 GiB. GPU 0 has a total capacity of 39.49 GiB of which 1.19 GiB is free. Including non-PyTorch memory, this process has 38.29 GiB memory in use. 
Of the allocated memory 28.93 GiB is allocated by PyTorch, and 8.25 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/requirements.txt b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 
+prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d7c204e361c34ad6b83f3e585b5c6922bcdf7d47 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T04:59:34.205846Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12144813256704" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": 
"GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "jxbyukuh3yklib8f49yg9pa95z492jqn" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/wandb-summary.json b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..dbd0ff66613c79f59dbf3d07b10733fb02d8e656 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/files/wandb-summary.json @@ -0,0 +1 @@ +{"avg_masking_rate":0.6341594457626343,"_timestamp":1.7653437126120093e+09,"_step":500,"step_loss":10.159809112548828,"generated_videos_first_frame":{"_type":"images/separated","width":128,"height":128,"format":"png","count":2,"filenames":["media/images/generated_videos_first_frame_500_e5bd64a3ce99f0946981.png","media/images/generated_videos_first_frame_500_bb0a0ba4ea3567f7f45a.png"],"captions":["a cat playing","a girl walking"]},"_wandb":{"runtime":979},"_runtime":979.936700086,"lr":5.9999999999999995e-05} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug-core.log b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..c694bf51c9ba042046a5d2702f7c8cbcaca1b608 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-10T04:59:34.274806638Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpab_coscc/port-2648638.txt","pid":2648638,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T04:59:34.275309337Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2648638} +{"time":"2025-12-10T04:59:34.275320056Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2648638-2648807-3055656174/socket","Net":"unix"}} +{"time":"2025-12-10T04:59:34.461477196Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T04:59:34.467604005Z","level":"INFO","msg":"handleInformInit: received","streamId":"tcqz8xbx","id":"1(@)"} +{"time":"2025-12-10T04:59:34.640293994Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"tcqz8xbx","id":"1(@)"} +{"time":"2025-12-10T05:15:54.748146784Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T05:15:54.74821762Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T05:15:54.748208689Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T05:15:54.748281193Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-10T05:15:54.7483001Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2648638-2648807-3055656174/socket","Net":"unix"}} 
+{"time":"2025-12-10T05:15:55.146957014Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-10T05:15:55.146980292Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-10T05:15:55.146992919Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug-internal.log b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..582f6b0b69c0d4fac87b70a965c249f9d9f48770 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-10T04:59:34.4677486Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T04:59:34.640069359Z","level":"INFO","msg":"stream: created new stream","id":"tcqz8xbx"} +{"time":"2025-12-10T04:59:34.640154163Z","level":"INFO","msg":"handler: started","stream_id":"tcqz8xbx"} +{"time":"2025-12-10T04:59:34.64028702Z","level":"INFO","msg":"stream: started","id":"tcqz8xbx"} +{"time":"2025-12-10T04:59:34.640304652Z","level":"INFO","msg":"writer: started","stream_id":"tcqz8xbx"} +{"time":"2025-12-10T04:59:34.640328965Z","level":"INFO","msg":"sender: started","stream_id":"tcqz8xbx"} +{"time":"2025-12-10T05:15:54.748218828Z","level":"INFO","msg":"stream: closing","id":"tcqz8xbx"} +{"time":"2025-12-10T05:15:54.994339802Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-10T05:15:55.14391639Z","level":"INFO","msg":"handler: closed","stream_id":"tcqz8xbx"} +{"time":"2025-12-10T05:15:55.143984436Z","level":"INFO","msg":"sender: closed","stream_id":"tcqz8xbx"} +{"time":"2025-12-10T05:15:55.14399513Z","level":"INFO","msg":"stream: closed","id":"tcqz8xbx"} diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug.log b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..a805d58e9d48220efa68b4c4cea7e93b5b03a2fe --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 04:59:34,208 INFO MainThread:2648638 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 04:59:34,208 INFO MainThread:2648638 [wandb_setup.py:_flush():80] Configure stats pid to 2648638 +2025-12-10 04:59:34,209 INFO MainThread:2648638 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 04:59:34,209 INFO MainThread:2648638 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 04:59:34,209 INFO MainThread:2648638 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 04:59:34,209 INFO MainThread:2648638 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug.log +2025-12-10 04:59:34,209 INFO MainThread:2648638 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_045934-tcqz8xbx/logs/debug-internal.log +2025-12-10 04:59:34,209 INFO MainThread:2648638 [wandb_init.py:init():841] calling init triggers +2025-12-10 04:59:34,209 INFO MainThread:2648638 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 04:59:34,209 INFO MainThread:2648638 [wandb_init.py:init():889] starting backend +2025-12-10 04:59:34,461 INFO MainThread:2648638 
[wandb_init.py:init():892] sending inform_init request +2025-12-10 04:59:34,466 INFO MainThread:2648638 [wandb_init.py:init():900] backend started and connected +2025-12-10 04:59:34,467 INFO MainThread:2648638 [wandb_init.py:init():970] updated telemetry +2025-12-10 04:59:34,471 INFO MainThread:2648638 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 04:59:34,811 INFO MainThread:2648638 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 04:59:34,935 INFO MainThread:2648638 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 04:59:34,935 INFO MainThread:2648638 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 04:59:34,935 INFO MainThread:2648638 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 04:59:34,935 INFO MainThread:2648638 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-10 04:59:34,938 INFO MainThread:2648638 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 04:59:34,939 INFO MainThread:2648638 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} +2025-12-10 05:15:54,748 INFO wandb-AsyncioManager-main:2648638 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 05:15:54,748 INFO wandb-AsyncioManager-main:2648638 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
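[Editor's note on run tcqz8xbx above] This run trained normally to step 500, saved the checkpoint and validation videos, and then failed in the backward pass with torch.OutOfMemoryError, immediately after the text encoder and video tokenizer had been loaded onto the GPU for validation. The error message itself suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce fragmentation; another plausible mitigation, if the validation-only models are indeed left resident, is to move them off the GPU before resuming training. Both are sketches under those assumptions, not a confirmed diagnosis; the helper below and its argument names are hypothetical, not part of train_mei_video.py.

    # Launch-time mitigation quoted from the error message (set before the process starts):
    #   PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True accelerate launch train/train_mei_video.py ...

    import gc
    import torch

    def offload_validation_models(text_encoder, video_tokenizer):
        # Move the validation-only modules back to CPU and return cached blocks to the
        # allocator so the next backward pass has headroom on the training GPU.
        text_encoder.to("cpu")
        video_tokenizer.to("cpu")
        gc.collect()
        torch.cuda.empty_cache()
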
diff --git a/Meissonic/wandb/run-20251210_045934-tcqz8xbx/run-tcqz8xbx.wandb b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/run-tcqz8xbx.wandb new file mode 100644 index 0000000000000000000000000000000000000000..82030cd98f704d2574f0a08fab810d71ef8a9252 --- /dev/null +++ b/Meissonic/wandb/run-20251210_045934-tcqz8xbx/run-tcqz8xbx.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:155e81b68779a620ce7ccdf1a999ba679e38fbd50f24d16923fba4bb6f09b2a7 +size 297961 diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/files/config.yaml b/Meissonic/wandb/run-20251210_065438-svzut638/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16869a19666c20c98292d5cac2dd17d0b2a5377b --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/files/config.yaml @@ -0,0 +1,305 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + allmhscr418v3bp5824ixjkvvxtxqjh3: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "8" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12144814084096" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + 
name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-10T06:54:38.912502Z" + writerId: allmhscr418v3bp5824ixjkvvxtxqjh3 + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +empty_embeds_path: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 8 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/files/media/images/generated_videos_first_frame_500_72143d6818e0e98395f3.png 
b/Meissonic/wandb/run-20251210_065438-svzut638/files/media/images/generated_videos_first_frame_500_72143d6818e0e98395f3.png new file mode 100644 index 0000000000000000000000000000000000000000..8d1f063ce4b0a2f0455285ce26dc0ff4b19dafba Binary files /dev/null and b/Meissonic/wandb/run-20251210_065438-svzut638/files/media/images/generated_videos_first_frame_500_72143d6818e0e98395f3.png differ diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/files/media/images/generated_videos_first_frame_500_c83cb9544c132f5fbda1.png b/Meissonic/wandb/run-20251210_065438-svzut638/files/media/images/generated_videos_first_frame_500_c83cb9544c132f5fbda1.png new file mode 100644 index 0000000000000000000000000000000000000000..1386ae7093ab1d45484bdbf6b3fa6a823461f94c Binary files /dev/null and b/Meissonic/wandb/run-20251210_065438-svzut638/files/media/images/generated_videos_first_frame_500_c83cb9544c132f5fbda1.png differ diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/files/output.log b/Meissonic/wandb/run-20251210_065438-svzut638/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..306d8c4d5672843aeada44c0ad0a08d0ad5a6bc5 --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/files/output.log @@ -0,0 +1,119 @@ +12/10/2025 06:54:39 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 06:54:39 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 06:54:39 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 06:54:39 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 06:54:39 - INFO - __main__ - Getting compressed dimensions from precomputed features... 
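The compressed dimensions reported on the next log line (F'=5, H'=16, W'=16) are consistent with the 4x temporal / 8x8 spatial compression implied by the Cosmos-0.1-Tokenizer-DV4x8x8 name, applied to the configured 17 frames at 128x128. A minimal check; the causal "+1" handling of the first frame is an assumption for illustration, not taken from the tokenizer code:

```python
# Latent grid implied by a 4x temporal, 8x8 spatial video tokenizer.
num_frames, height, width = 17, 128, 128
t_factor, s_factor = 4, 8

f_latent = (num_frames - 1) // t_factor + 1   # first frame kept, remaining 16 frames in groups of 4 -> 5
h_latent = height // s_factor                 # 16
w_latent = width // s_factor                  # 16

print(f_latent, h_latent, w_latent)           # 5 16 16, matching the metadata
```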
+12/10/2025 06:54:39 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 06:54:39 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 06:54:39 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 06:54:39 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 06:54:56 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 06:54:56 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 06:54:57 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 06:54:59 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 06:54:59 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 06:54:59 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 06:54:59 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 06:54:59 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 06:54:59 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 06:54:59 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 06:54:59 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 06:54:59 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 06:54:59 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 06:54:59 - INFO - __main__ - Dataloader configuration: +12/10/2025 06:54:59 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 06:54:59 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 06:54:59 - INFO - __main__ - - persistent_workers: True +12/10/2025 06:54:59 - INFO - __main__ - - pin_memory: True +12/10/2025 06:54:59 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 06:55:01 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 06:55:01 - INFO - __main__ - Loading empty_embeds from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +12/10/2025 06:55:01 - INFO - __main__ - Empty embeds info from metadata: shape=[1, 512, 4096] +12/10/2025 06:55:01 - INFO - __main__ - Loaded empty_embeds: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16 +12/10/2025 06:55:01 - INFO - __main__ - ***** Running training ***** +12/10/2025 06:55:01 - INFO - __main__ - Num training steps = 10000 +12/10/2025 06:55:01 - INFO - __main__ - Instantaneous batch size per device = 8 +12/10/2025 06:55:01 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 256 +12/10/2025 06:55:01 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/10/2025 06:55:23 - INFO - __main__ - Step: 10 Loss: 11.0753 LR: 0.000060 +12/10/2025 06:55:39 - INFO - __main__ - Step: 20 Loss: 11.0730 LR: 0.000060 +12/10/2025 06:55:56 - INFO - __main__ - Step: 30 Loss: 11.0694 LR: 0.000060 +12/10/2025 06:56:13 - INFO - __main__ - Step: 40 Loss: 11.0665 LR: 0.000060 +12/10/2025 06:56:32 - INFO - __main__ - Step: 50 Loss: 11.0638 LR: 0.000060 +12/10/2025 06:56:49 - INFO - __main__ - Step: 60 Loss: 11.0581 LR: 0.000060 +12/10/2025 06:57:05 - INFO - __main__ - Step: 70 Loss: 11.0504 LR: 0.000060 +12/10/2025 06:57:22 - INFO - __main__ - Step: 80 Loss: 11.0334 LR: 0.000060 +12/10/2025 06:57:39 - INFO - __main__ - Step: 90 Loss: 11.0098 LR: 0.000060 +12/10/2025 06:57:55 - INFO - __main__ - Step: 100 Loss: 10.9694 LR: 0.000060 +12/10/2025 06:58:12 - INFO - __main__ - Step: 110 Loss: 10.9226 LR: 0.000060 +12/10/2025 06:58:29 - INFO - __main__ - Step: 120 Loss: 10.8645 LR: 0.000060 +12/10/2025 06:58:46 - INFO - __main__ - Step: 130 Loss: 10.7966 LR: 0.000060 +12/10/2025 06:59:03 - INFO - __main__ - Step: 140 Loss: 10.7388 LR: 0.000060 +12/10/2025 06:59:20 - INFO - __main__ - Step: 150 Loss: 10.6719 LR: 0.000060 +12/10/2025 06:59:38 - INFO - __main__ - Step: 160 Loss: 10.6265 LR: 0.000060 +12/10/2025 06:59:54 - INFO - __main__ - Step: 170 Loss: 10.5702 LR: 0.000060 +12/10/2025 07:00:11 - INFO - __main__ - Step: 180 Loss: 10.5286 LR: 0.000060 +12/10/2025 07:00:27 - INFO - __main__ - Step: 190 Loss: 10.4710 LR: 0.000060 +12/10/2025 07:00:44 - INFO - __main__ - Step: 200 Loss: 10.4425 LR: 0.000060 +12/10/2025 07:01:00 - INFO - __main__ - Step: 210 Loss: 10.4138 LR: 0.000060 +12/10/2025 07:01:16 - INFO - __main__ - Step: 220 Loss: 10.3857 LR: 0.000060 +12/10/2025 07:01:33 - INFO - __main__ - Step: 230 Loss: 10.3620 LR: 0.000060 +12/10/2025 07:01:49 - INFO - __main__ - Step: 240 Loss: 10.3475 LR: 0.000060 +12/10/2025 07:02:06 - INFO - __main__ - Step: 250 Loss: 10.3281 LR: 0.000060 +12/10/2025 07:02:23 - INFO - __main__ - Step: 260 Loss: 10.3099 LR: 0.000060 +12/10/2025 07:02:41 - INFO - __main__ - Step: 270 Loss: 10.3059 LR: 0.000060 +12/10/2025 07:02:59 - INFO - __main__ - Step: 280 Loss: 10.3050 LR: 0.000060 +12/10/2025 07:03:18 - INFO - __main__ - Step: 290 Loss: 10.2775 LR: 0.000060 +12/10/2025 07:03:36 - INFO - __main__ - Step: 300 Loss: 10.2930 LR: 0.000060 +12/10/2025 07:03:55 - INFO - __main__ - Step: 310 Loss: 10.2803 LR: 0.000060 +12/10/2025 07:04:12 - INFO - __main__ - Step: 320 Loss: 10.2736 LR: 0.000060 +12/10/2025 07:04:29 - INFO - __main__ - Step: 330 Loss: 10.2748 LR: 0.000060 +12/10/2025 07:04:46 - INFO - __main__ - Step: 340 Loss: 10.2525 LR: 0.000060 +12/10/2025 07:05:03 - INFO - __main__ - Step: 350 Loss: 10.2732 LR: 0.000060 +12/10/2025 07:05:20 - INFO - __main__ - Step: 360 Loss: 10.2562 LR: 0.000060 +12/10/2025 07:05:36 - INFO - __main__ - Step: 370 Loss: 10.2201 LR: 0.000060 +12/10/2025 07:05:52 - INFO - __main__ - Step: 380 Loss: 10.2399 LR: 0.000060 +12/10/2025 07:06:09 - INFO - __main__ - Step: 390 Loss: 10.2113 LR: 0.000060 +12/10/2025 07:06:25 
- INFO - __main__ - Step: 400 Loss: 10.2313 LR: 0.000060 +12/10/2025 07:06:43 - INFO - __main__ - Step: 410 Loss: 10.2194 LR: 0.000060 +12/10/2025 07:07:00 - INFO - __main__ - Step: 420 Loss: 10.2263 LR: 0.000060 +12/10/2025 07:07:16 - INFO - __main__ - Step: 430 Loss: 10.2091 LR: 0.000060 +12/10/2025 07:07:33 - INFO - __main__ - Step: 440 Loss: 10.2121 LR: 0.000060 +12/10/2025 07:07:50 - INFO - __main__ - Step: 450 Loss: 10.2016 LR: 0.000060 +12/10/2025 07:08:07 - INFO - __main__ - Step: 460 Loss: 10.1867 LR: 0.000060 +12/10/2025 07:08:24 - INFO - __main__ - Step: 470 Loss: 10.1775 LR: 0.000060 +12/10/2025 07:08:41 - INFO - __main__ - Step: 480 Loss: 10.1756 LR: 0.000060 +12/10/2025 07:08:57 - INFO - __main__ - Step: 490 Loss: 10.1692 LR: 0.000060 +12/10/2025 07:09:13 - INFO - __main__ - Step: 500 Loss: 10.1555 LR: 0.000060 +12/10/2025 07:09:13 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 07:10:03 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/optimizer.bin +12/10/2025 07:10:03 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/scheduler.bin +12/10/2025 07:10:03 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/sampler.bin +12/10/2025 07:10:03 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl +12/10/2025 07:10:03 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 07:10:03 - INFO - __main__ - Generating videos for validation... +12/10/2025 07:10:03 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 71.50it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4328.49it/s] +12/10/2025 07:10:20 - INFO - __main__ - Text encoder and video tokenizer loaded for validation +12/10/2025 07:10:20 - INFO - __main__ - Generating videos for validation... 
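Shortly after this validation pass, the run's output.log ends with an UnboundLocalError (see the traceback a few lines below): train_mei_video.py line 1466 reads `video_tokenizer.mask_token_id`, but in the precomputed-features path the tokenizer is only loaded temporarily for validation, so the local variable is not bound at that point. The sketch below is a hypothetical guard, not the project's actual fix; the helper name and the fallback to the metadata value (mask_token_id=64000, as logged at startup) are assumptions:

```python
def resolve_mask_id(video_tokenizer=None, metadata_mask_token_id=64000):
    """Prefer the loaded tokenizer's mask id; otherwise fall back to the value read from metadata."""
    if video_tokenizer is not None:
        return video_tokenizer.mask_token_id
    return metadata_mask_token_id

# Precomputed-features path: no tokenizer object is kept around, so use the metadata value.
mask_id = resolve_mask_id(None)   # -> 64000
```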
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.11it/s] +12/10/2025 07:10:27 - INFO - __main__ - Validation videos saved to ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +12/10/2025 07:10:27 - INFO - __main__ - Cleaned up validation models and freed GPU memory +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1765, in + main(parse_args()) + File "/mnt/Meissonic/train/train_mei_video.py", line 1466, in main + mask_id = video_tokenizer.mask_token_id # codebook_size +UnboundLocalError: local variable 'video_tokenizer' referenced before assignment +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1765, in +[rank0]: main(parse_args()) +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1466, in main +[rank0]: mask_id = video_tokenizer.mask_token_id # codebook_size +[rank0]: UnboundLocalError: local variable 'video_tokenizer' referenced before assignment diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/files/requirements.txt b/Meissonic/wandb/run-20251210_065438-svzut638/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 
+numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 +nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_065438-svzut638/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..45ccb5ecaa4d08fdf6a5d21e01b5355286f87696 --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T06:54:38.912502Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12144814084096" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + 
"architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "allmhscr418v3bp5824ixjkvvxtxqjh3" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/files/wandb-summary.json b/Meissonic/wandb/run-20251210_065438-svzut638/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..7c42cd11727d450d62991df63dfb701d1506021f --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/files/wandb-summary.json @@ -0,0 +1 @@ +{"step_loss":10.155487060546875,"lr":5.9999999999999995e-05,"generated_videos_first_frame":{"captions":["a cat playing","a girl walking"],"_type":"images/separated","width":128,"height":128,"format":"png","count":2,"filenames":["media/images/generated_videos_first_frame_500_c83cb9544c132f5fbda1.png","media/images/generated_videos_first_frame_500_72143d6818e0e98395f3.png"]},"_wandb":{"runtime":988},"_runtime":988.319789275,"avg_masking_rate":0.6341594457626343,"_timestamp":1.7653506274207036e+09,"_step":500} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug-core.log b/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..b51c39b563cdc414c99b8950fd02e53f7b5cb1a2 --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-10T06:54:38.982036417Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpupo0lc_m/port-2759789.txt","pid":2759789,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T06:54:38.982493461Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2759789} +{"time":"2025-12-10T06:54:38.982502882Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2759789-2760029-2313296116/socket","Net":"unix"}} +{"time":"2025-12-10T06:54:39.168820915Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T06:54:39.174838218Z","level":"INFO","msg":"handleInformInit: 
received","streamId":"svzut638","id":"1(@)"} +{"time":"2025-12-10T06:54:39.339741536Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"svzut638","id":"1(@)"} +{"time":"2025-12-10T07:11:07.910542343Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T07:11:07.910625646Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T07:11:07.910615044Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T07:11:07.910702456Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-10T07:11:07.910706025Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2759789-2760029-2313296116/socket","Net":"unix"}} +{"time":"2025-12-10T07:11:08.290894957Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-10T07:11:08.290918847Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-10T07:11:08.290930553Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug-internal.log b/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..96bc03e758f51a41ee2ad8d6d85bac1f15006999 --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-10T06:54:39.174951785Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T06:54:39.339516206Z","level":"INFO","msg":"stream: created new stream","id":"svzut638"} +{"time":"2025-12-10T06:54:39.3396309Z","level":"INFO","msg":"handler: started","stream_id":"svzut638"} +{"time":"2025-12-10T06:54:39.339734022Z","level":"INFO","msg":"stream: started","id":"svzut638"} +{"time":"2025-12-10T06:54:39.339758383Z","level":"INFO","msg":"sender: started","stream_id":"svzut638"} +{"time":"2025-12-10T06:54:39.339761537Z","level":"INFO","msg":"writer: started","stream_id":"svzut638"} +{"time":"2025-12-10T07:11:07.910612265Z","level":"INFO","msg":"stream: closing","id":"svzut638"} +{"time":"2025-12-10T07:11:08.148078474Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-10T07:11:08.287808812Z","level":"INFO","msg":"handler: closed","stream_id":"svzut638"} +{"time":"2025-12-10T07:11:08.287886196Z","level":"INFO","msg":"sender: closed","stream_id":"svzut638"} +{"time":"2025-12-10T07:11:08.287897958Z","level":"INFO","msg":"stream: closed","id":"svzut638"} diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug.log b/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..24e08154508293316d84edef844701ec21b1f5d9 --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_setup.py:_flush():80] Configure stats pid to 2759789 +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_setup.py:_flush():80] Loading 
settings from environment variables +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug.log +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_065438-svzut638/logs/debug-internal.log +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_init.py:init():841] calling init triggers +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 06:54:38,915 INFO MainThread:2759789 [wandb_init.py:init():889] starting backend +2025-12-10 06:54:39,168 INFO MainThread:2759789 [wandb_init.py:init():892] sending inform_init request +2025-12-10 06:54:39,173 INFO MainThread:2759789 [wandb_init.py:init():900] backend started and connected +2025-12-10 06:54:39,174 INFO MainThread:2759789 [wandb_init.py:init():970] updated telemetry +2025-12-10 06:54:39,179 INFO MainThread:2759789 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 06:54:39,590 INFO MainThread:2759789 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 06:54:39,717 INFO MainThread:2759789 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 06:54:39,718 INFO MainThread:2759789 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 06:54:39,718 INFO MainThread:2759789 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 06:54:39,718 INFO MainThread:2759789 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-10 06:54:39,721 INFO MainThread:2759789 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 06:54:39,722 INFO MainThread:2759789 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 
'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} +2025-12-10 07:11:07,910 INFO wandb-AsyncioManager-main:2759789 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 07:11:07,910 INFO wandb-AsyncioManager-main:2759789 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/Meissonic/wandb/run-20251210_065438-svzut638/run-svzut638.wandb b/Meissonic/wandb/run-20251210_065438-svzut638/run-svzut638.wandb new file mode 100644 index 0000000000000000000000000000000000000000..fd3f2d7f556e174a6f6e1edf314f4a2dc66ce2d9 --- /dev/null +++ b/Meissonic/wandb/run-20251210_065438-svzut638/run-svzut638.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffbf106f6012581154ef827e713d00f725e5d801d3e39c6b444e835feaf713a4 +size 295153 diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/config.yaml b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62880696d70f5ea20ae57745533f6a0eb6d7f8be --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/config.yaml @@ -0,0 +1,305 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + ydwpsmj3zkrh2el3ienk1qod2cym9vjb: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "8" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12144814755840" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - 
architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.10.19 + root: /mnt/Meissonic + startedAt: "2025-12-10T07:17:16.641784Z" + writerId: ydwpsmj3zkrh2el3ienk1qod2cym9vjb + m: [] + python_version: 3.10.19 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + "3": + - 61 + "4": 3.10.19 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +empty_embeds_path: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 
8 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_1000_0dab8879463923119ab0.png b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_1000_0dab8879463923119ab0.png new file mode 100644 index 0000000000000000000000000000000000000000..c5e53a182159759e474169502edacf59cbc13815 Binary files /dev/null and b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_1000_0dab8879463923119ab0.png differ diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_1000_79ca9194981b4a266799.png b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_1000_79ca9194981b4a266799.png new file mode 100644 index 0000000000000000000000000000000000000000..b882ccb65e9ff567b955e7b765f357236e0d9f1f Binary files /dev/null and b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_1000_79ca9194981b4a266799.png differ diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_500_ca222b2e9040c4c4b096.png b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_500_ca222b2e9040c4c4b096.png new file mode 100644 index 0000000000000000000000000000000000000000..765ed1fe60fdc94e92732ceee140f9372da13bf7 Binary files /dev/null and b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_500_ca222b2e9040c4c4b096.png differ diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_500_edd6b527675de3516a45.png b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_500_edd6b527675de3516a45.png new file mode 100644 index 0000000000000000000000000000000000000000..2f6b95770e16b1495e2ca9e12d958b42ddcce55a Binary files /dev/null and b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/media/images/generated_videos_first_frame_500_edd6b527675de3516a45.png differ diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/output.log b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..4bfbedcf9cbee7f17e3a856024c0299b304b0432 --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/output.log @@ -0,0 +1,344 @@ +12/10/2025 07:17:31 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 07:17:31 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 07:17:31 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 07:17:31 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 07:17:31 - INFO - __main__ - Getting compressed dimensions from precomputed 
features... +12/10/2025 07:17:31 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 07:17:31 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 07:17:31 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 07:17:32 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 07:17:48 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 07:17:48 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 07:17:50 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 07:17:51 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 07:17:51 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 07:17:51 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 07:17:51 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 07:17:51 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 07:17:51 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 07:17:51 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 07:17:51 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 07:17:51 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 07:17:51 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 07:17:51 - INFO - __main__ - Dataloader configuration: +12/10/2025 07:17:51 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 07:17:51 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 07:17:51 - INFO - __main__ - - persistent_workers: True +12/10/2025 07:17:51 - INFO - __main__ - - pin_memory: True +12/10/2025 07:17:51 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 07:17:53 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 07:17:53 - INFO - __main__ - Loading empty_embeds from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +12/10/2025 07:17:53 - INFO - __main__ - Empty embeds info from metadata: shape=[1, 512, 4096] +12/10/2025 07:17:53 - INFO - __main__ - Loaded empty_embeds: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16 +12/10/2025 07:17:53 - INFO - __main__ - ***** Running training ***** +12/10/2025 07:17:53 - INFO - __main__ - Num training steps = 10000 +12/10/2025 07:17:53 - INFO - __main__ - Instantaneous batch size per device = 8 +12/10/2025 07:17:53 - INFO - __main__ - Total train batch size (w. 
parallel, distributed & accumulation) = 256 +12/10/2025 07:17:53 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/10/2025 07:18:14 - INFO - __main__ - Step: 10 Loss: 11.0753 LR: 0.000060 +12/10/2025 07:18:31 - INFO - __main__ - Step: 20 Loss: 11.0730 LR: 0.000060 +12/10/2025 07:18:48 - INFO - __main__ - Step: 30 Loss: 11.0695 LR: 0.000060 +12/10/2025 07:19:05 - INFO - __main__ - Step: 40 Loss: 11.0665 LR: 0.000060 +12/10/2025 07:19:21 - INFO - __main__ - Step: 50 Loss: 11.0638 LR: 0.000060 +12/10/2025 07:19:38 - INFO - __main__ - Step: 60 Loss: 11.0581 LR: 0.000060 +12/10/2025 07:19:54 - INFO - __main__ - Step: 70 Loss: 11.0504 LR: 0.000060 +12/10/2025 07:20:11 - INFO - __main__ - Step: 80 Loss: 11.0334 LR: 0.000060 +12/10/2025 07:20:28 - INFO - __main__ - Step: 90 Loss: 11.0099 LR: 0.000060 +12/10/2025 07:20:45 - INFO - __main__ - Step: 100 Loss: 10.9695 LR: 0.000060 +12/10/2025 07:21:02 - INFO - __main__ - Step: 110 Loss: 10.9229 LR: 0.000060 +12/10/2025 07:21:20 - INFO - __main__ - Step: 120 Loss: 10.8647 LR: 0.000060 +12/10/2025 07:21:37 - INFO - __main__ - Step: 130 Loss: 10.7967 LR: 0.000060 +12/10/2025 07:21:54 - INFO - __main__ - Step: 140 Loss: 10.7392 LR: 0.000060 +12/10/2025 07:22:11 - INFO - __main__ - Step: 150 Loss: 10.6721 LR: 0.000060 +12/10/2025 07:22:29 - INFO - __main__ - Step: 160 Loss: 10.6267 LR: 0.000060 +12/10/2025 07:22:45 - INFO - __main__ - Step: 170 Loss: 10.5702 LR: 0.000060 +12/10/2025 07:23:01 - INFO - __main__ - Step: 180 Loss: 10.5286 LR: 0.000060 +12/10/2025 07:23:18 - INFO - __main__ - Step: 190 Loss: 10.4711 LR: 0.000060 +12/10/2025 07:23:34 - INFO - __main__ - Step: 200 Loss: 10.4432 LR: 0.000060 +12/10/2025 07:23:51 - INFO - __main__ - Step: 210 Loss: 10.4134 LR: 0.000060 +12/10/2025 07:24:07 - INFO - __main__ - Step: 220 Loss: 10.3854 LR: 0.000060 +12/10/2025 07:24:23 - INFO - __main__ - Step: 230 Loss: 10.3619 LR: 0.000060 +12/10/2025 07:24:40 - INFO - __main__ - Step: 240 Loss: 10.3476 LR: 0.000060 +12/10/2025 07:24:56 - INFO - __main__ - Step: 250 Loss: 10.3288 LR: 0.000060 +12/10/2025 07:25:13 - INFO - __main__ - Step: 260 Loss: 10.3103 LR: 0.000060 +12/10/2025 07:25:29 - INFO - __main__ - Step: 270 Loss: 10.3060 LR: 0.000060 +12/10/2025 07:25:45 - INFO - __main__ - Step: 280 Loss: 10.3030 LR: 0.000060 +12/10/2025 07:26:02 - INFO - __main__ - Step: 290 Loss: 10.2779 LR: 0.000060 +12/10/2025 07:26:18 - INFO - __main__ - Step: 300 Loss: 10.2935 LR: 0.000060 +12/10/2025 07:26:34 - INFO - __main__ - Step: 310 Loss: 10.2803 LR: 0.000060 +12/10/2025 07:26:51 - INFO - __main__ - Step: 320 Loss: 10.2737 LR: 0.000060 +12/10/2025 07:27:07 - INFO - __main__ - Step: 330 Loss: 10.2748 LR: 0.000060 +12/10/2025 07:27:23 - INFO - __main__ - Step: 340 Loss: 10.2528 LR: 0.000060 +12/10/2025 07:27:39 - INFO - __main__ - Step: 350 Loss: 10.2735 LR: 0.000060 +12/10/2025 07:27:56 - INFO - __main__ - Step: 360 Loss: 10.2561 LR: 0.000060 +12/10/2025 07:28:13 - INFO - __main__ - Step: 370 Loss: 10.2202 LR: 0.000060 +12/10/2025 07:28:29 - INFO - __main__ - Step: 380 Loss: 10.2401 LR: 0.000060 +12/10/2025 07:28:45 - INFO - __main__ - Step: 390 Loss: 10.2113 LR: 0.000060 +12/10/2025 07:29:02 
- INFO - __main__ - Step: 400 Loss: 10.2319 LR: 0.000060 +12/10/2025 07:29:18 - INFO - __main__ - Step: 410 Loss: 10.2197 LR: 0.000060 +12/10/2025 07:29:35 - INFO - __main__ - Step: 420 Loss: 10.2274 LR: 0.000060 +12/10/2025 07:29:53 - INFO - __main__ - Step: 430 Loss: 10.2096 LR: 0.000060 +12/10/2025 07:30:11 - INFO - __main__ - Step: 440 Loss: 10.2125 LR: 0.000060 +12/10/2025 07:30:29 - INFO - __main__ - Step: 450 Loss: 10.2030 LR: 0.000060 +12/10/2025 07:30:46 - INFO - __main__ - Step: 460 Loss: 10.1881 LR: 0.000060 +12/10/2025 07:31:02 - INFO - __main__ - Step: 470 Loss: 10.1804 LR: 0.000060 +12/10/2025 07:31:19 - INFO - __main__ - Step: 480 Loss: 10.1765 LR: 0.000060 +12/10/2025 07:31:35 - INFO - __main__ - Step: 490 Loss: 10.1706 LR: 0.000060 +12/10/2025 07:31:51 - INFO - __main__ - Step: 500 Loss: 10.1511 LR: 0.000060 +12/10/2025 07:31:51 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 07:32:41 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/optimizer.bin +12/10/2025 07:32:41 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/scheduler.bin +12/10/2025 07:32:41 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/sampler.bin +12/10/2025 07:32:41 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl +12/10/2025 07:32:41 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 07:32:41 - INFO - __main__ - Generating videos for validation... +12/10/2025 07:32:41 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 73.03it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 4336.80it/s] +12/10/2025 07:32:57 - INFO - __main__ - Text encoder and video tokenizer loaded for validation +12/10/2025 07:32:57 - INFO - __main__ - Generating videos for validation... 
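The transformers warning captured above ("You are using the default legacy behaviour ... set `legacy=False`") can be opted out of when the validation tokenizer is created. A minimal sketch, assuming the umt5-xxl text encoder's tokenizer is loaded via AutoTokenizer from the `google/umt5-xxl` checkpoint; the exact loading call and checkpoint id used by train_mei_video.py are not shown in these logs:

```python
from transformers import AutoTokenizer

# Opt in to the non-legacy tokenization behaviour referenced by the warning in the log.
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl", legacy=False)
```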
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.06it/s] +12/10/2025 07:33:04 - INFO - __main__ - Validation videos saved to ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +12/10/2025 07:33:04 - INFO - __main__ - Cleaned up validation models and freed GPU memory +12/10/2025 07:33:22 - INFO - __main__ - Step: 510 Loss: 10.1642 LR: 0.000060 +12/10/2025 07:33:39 - INFO - __main__ - Step: 520 Loss: 10.1155 LR: 0.000060 +12/10/2025 07:33:56 - INFO - __main__ - Step: 530 Loss: 10.1195 LR: 0.000060 +12/10/2025 07:34:14 - INFO - __main__ - Step: 540 Loss: 10.0654 LR: 0.000060 +12/10/2025 07:34:31 - INFO - __main__ - Step: 550 Loss: 10.0827 LR: 0.000060 +12/10/2025 07:34:48 - INFO - __main__ - Step: 560 Loss: 10.0255 LR: 0.000060 +12/10/2025 07:35:04 - INFO - __main__ - Step: 570 Loss: 10.0276 LR: 0.000060 +12/10/2025 07:35:21 - INFO - __main__ - Step: 580 Loss: 10.0056 LR: 0.000060 +12/10/2025 07:35:37 - INFO - __main__ - Step: 590 Loss: 9.9365 LR: 0.000060 +12/10/2025 07:35:54 - INFO - __main__ - Step: 600 Loss: 9.8320 LR: 0.000060 +12/10/2025 07:36:10 - INFO - __main__ - Step: 610 Loss: 10.0306 LR: 0.000060 +12/10/2025 07:36:27 - INFO - __main__ - Step: 620 Loss: 9.7723 LR: 0.000060 +12/10/2025 07:36:43 - INFO - __main__ - Step: 630 Loss: 9.6669 LR: 0.000060 +12/10/2025 07:36:59 - INFO - __main__ - Step: 640 Loss: 9.9538 LR: 0.000060 +12/10/2025 07:37:16 - INFO - __main__ - Step: 650 Loss: 9.8120 LR: 0.000060 +12/10/2025 07:37:33 - INFO - __main__ - Step: 660 Loss: 9.8283 LR: 0.000060 +12/10/2025 07:37:49 - INFO - __main__ - Step: 670 Loss: 9.5049 LR: 0.000060 +12/10/2025 07:38:06 - INFO - __main__ - Step: 680 Loss: 9.8073 LR: 0.000060 +12/10/2025 07:38:23 - INFO - __main__ - Step: 690 Loss: 9.7236 LR: 0.000060 +12/10/2025 07:38:40 - INFO - __main__ - Step: 700 Loss: 9.1967 LR: 0.000060 +12/10/2025 07:38:57 - INFO - __main__ - Step: 710 Loss: 9.3345 LR: 0.000060 +12/10/2025 07:39:14 - INFO - __main__ - Step: 720 Loss: 9.0384 LR: 0.000060 +12/10/2025 07:39:31 - INFO - __main__ - Step: 730 Loss: 9.6747 LR: 0.000060 +12/10/2025 07:39:49 - INFO - __main__ - Step: 740 Loss: 9.3858 LR: 0.000060 +12/10/2025 07:40:06 - INFO - __main__ - Step: 750 Loss: 8.8479 LR: 0.000060 +12/10/2025 07:40:23 - INFO - __main__ - Step: 760 Loss: 9.1746 LR: 0.000060 +12/10/2025 07:40:39 - INFO - __main__ - Step: 770 Loss: 8.8764 LR: 0.000060 +12/10/2025 07:40:56 - INFO - __main__ - Step: 780 Loss: 9.0416 LR: 0.000060 +12/10/2025 07:41:14 - INFO - __main__ - Step: 790 Loss: 8.6986 LR: 0.000060 +12/10/2025 07:41:32 - INFO - __main__ - Step: 800 Loss: 8.5523 LR: 0.000060 +12/10/2025 07:41:50 - INFO - __main__ - Step: 810 Loss: 7.5571 LR: 0.000060 +12/10/2025 07:42:08 - INFO - __main__ - Step: 820 Loss: 8.3334 LR: 0.000060 +12/10/2025 07:42:25 - INFO - __main__ - Step: 830 Loss: 7.9969 LR: 0.000060 +12/10/2025 07:42:42 - INFO - __main__ - Step: 840 Loss: 8.5891 LR: 0.000060 +12/10/2025 07:42:58 - INFO - __main__ - Step: 850 Loss: 7.8850 LR: 0.000060 +12/10/2025 07:43:14 - INFO - __main__ - Step: 860 Loss: 8.7087 LR: 0.000060 +12/10/2025 07:43:31 - INFO - __main__ - Step: 870 Loss: 8.1369 LR: 0.000060 +12/10/2025 07:43:48 - INFO - __main__ - Step: 880 Loss: 8.4226 LR: 0.000060 +12/10/2025 07:44:04 - INFO - __main__ - Step: 890 Loss: 8.6348 LR: 0.000060 +12/10/2025 07:44:21 - INFO - __main__ - Step: 900 Loss: 7.6271 LR: 0.000060 +12/10/2025 07:44:37 - INFO - __main__ - Step: 910 Loss: 8.4708 LR: 0.000060 
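The step-10 loss of roughly 11.07 that both runs start from is consistent with a cross-entropy over the 64,000-entry codebook reported in the metadata: a near-uniform prediction gives ln(64000) ≈ 11.07. A one-line check:

```python
import math

codebook_size = 64_000                  # "codebook_size=64000" in the startup log
print(math.log(codebook_size))          # ~11.0666, close to the ~11.075 loss at step 10
```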
+12/10/2025 07:44:53 - INFO - __main__ - Step: 920 Loss: 7.6171 LR: 0.000060 +12/10/2025 07:45:09 - INFO - __main__ - Step: 930 Loss: 7.9358 LR: 0.000060 +12/10/2025 07:45:27 - INFO - __main__ - Step: 940 Loss: 6.9904 LR: 0.000060 +12/10/2025 07:45:43 - INFO - __main__ - Step: 950 Loss: 7.5480 LR: 0.000060 +12/10/2025 07:46:00 - INFO - __main__ - Step: 960 Loss: 7.7005 LR: 0.000060 +12/10/2025 07:46:17 - INFO - __main__ - Step: 970 Loss: 4.9190 LR: 0.000060 +12/10/2025 07:46:36 - INFO - __main__ - Step: 980 Loss: 7.4162 LR: 0.000060 +12/10/2025 07:46:52 - INFO - __main__ - Step: 990 Loss: 7.0244 LR: 0.000060 +12/10/2025 07:47:10 - INFO - __main__ - Step: 1000 Loss: 6.2988 LR: 0.000060 +12/10/2025 07:47:10 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000 +12/10/2025 07:47:16 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/optimizer.bin +12/10/2025 07:47:16 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/scheduler.bin +12/10/2025 07:47:16 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/sampler.bin +12/10/2025 07:47:16 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_0.pkl +12/10/2025 07:47:16 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000 +12/10/2025 07:47:16 - INFO - __main__ - Generating videos for validation... +12/10/2025 07:47:16 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 74.40it/s] +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 7616.12it/s] +12/10/2025 07:47:32 - INFO - __main__ - Text encoder and video tokenizer loaded for validation +12/10/2025 07:47:32 - INFO - __main__ - Generating videos for validation... 
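Each `checkpoint-<N>` directory saved above (optimizer.bin, scheduler.bin, sampler.bin, RNG states) can be restored through accelerate's `load_state`, which is what the script's `--resume_from_checkpoint` option (left unset in this run) would exercise. A minimal sketch, with illustrative variable names:

```python
# Hypothetical resume helper for a checkpoint-<N> directory like the ones saved above.
import os

def maybe_resume(accelerator, args):
    if not args.resume_from_checkpoint:
        return 0
    ckpt_dir = args.resume_from_checkpoint  # e.g. ".../checkpoint-1000"
    # Restores model weights, optimizer.bin, scheduler.bin, sampler.bin and the saved
    # RNG states, so training continues from the same point in the data stream.
    accelerator.load_state(ckpt_dir)
    # Recover the global step from the directory name (checkpoint-<global_step>).
    return int(os.path.basename(os.path.normpath(ckpt_dir)).split("-")[-1])
```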
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.15it/s] +12/10/2025 07:47:38 - INFO - __main__ - Validation videos saved to ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +12/10/2025 07:47:38 - INFO - __main__ - Cleaned up validation models and freed GPU memory +12/10/2025 07:47:55 - INFO - __main__ - Step: 1010 Loss: 6.0339 LR: 0.000060 +12/10/2025 07:48:12 - INFO - __main__ - Step: 1020 Loss: 5.9514 LR: 0.000060 +12/10/2025 07:48:29 - INFO - __main__ - Step: 1030 Loss: 3.9624 LR: 0.000060 +12/10/2025 07:48:47 - INFO - __main__ - Step: 1040 Loss: 6.0093 LR: 0.000060 +12/10/2025 07:49:05 - INFO - __main__ - Step: 1050 Loss: 6.8974 LR: 0.000060 +12/10/2025 07:49:23 - INFO - __main__ - Step: 1060 Loss: 6.1530 LR: 0.000060 +12/10/2025 07:49:42 - INFO - __main__ - Step: 1070 Loss: 6.1358 LR: 0.000060 +12/10/2025 07:50:00 - INFO - __main__ - Step: 1080 Loss: 4.2982 LR: 0.000060 +12/10/2025 07:50:18 - INFO - __main__ - Step: 1090 Loss: 4.7812 LR: 0.000060 +12/10/2025 07:50:36 - INFO - __main__ - Step: 1100 Loss: 5.3707 LR: 0.000060 +12/10/2025 07:50:55 - INFO - __main__ - Step: 1110 Loss: 5.8028 LR: 0.000060 +12/10/2025 07:51:11 - INFO - __main__ - Step: 1120 Loss: 5.2905 LR: 0.000060 +12/10/2025 07:51:27 - INFO - __main__ - Step: 1130 Loss: 4.7033 LR: 0.000060 +12/10/2025 07:51:44 - INFO - __main__ - Step: 1140 Loss: 4.0230 LR: 0.000060 +12/10/2025 07:52:01 - INFO - __main__ - Step: 1150 Loss: 5.4948 LR: 0.000060 +12/10/2025 07:52:17 - INFO - __main__ - Step: 1160 Loss: 4.7230 LR: 0.000060 +12/10/2025 07:52:34 - INFO - __main__ - Step: 1170 Loss: 4.5157 LR: 0.000060 +12/10/2025 07:52:51 - INFO - __main__ - Step: 1180 Loss: 4.5091 LR: 0.000060 +12/10/2025 07:53:07 - INFO - __main__ - Step: 1190 Loss: 6.7011 LR: 0.000060 +12/10/2025 07:53:23 - INFO - __main__ - Step: 1200 Loss: 2.4988 LR: 0.000060 +12/10/2025 07:53:40 - INFO - __main__ - Step: 1210 Loss: 6.0789 LR: 0.000060 +12/10/2025 07:53:56 - INFO - __main__ - Step: 1220 Loss: 2.7961 LR: 0.000060 +12/10/2025 07:54:13 - INFO - __main__ - Step: 1230 Loss: 3.2556 LR: 0.000060 +12/10/2025 07:54:29 - INFO - __main__ - Step: 1240 Loss: 3.3652 LR: 0.000060 +12/10/2025 07:54:46 - INFO - __main__ - Step: 1250 Loss: 2.4001 LR: 0.000060 +12/10/2025 07:55:03 - INFO - __main__ - Step: 1260 Loss: 3.9875 LR: 0.000060 +12/10/2025 07:55:19 - INFO - __main__ - Step: 1270 Loss: 2.5632 LR: 0.000060 +12/10/2025 07:55:36 - INFO - __main__ - Step: 1280 Loss: 5.4715 LR: 0.000060 +12/10/2025 07:55:53 - INFO - __main__ - Step: 1290 Loss: 3.4570 LR: 0.000060 +12/10/2025 07:56:09 - INFO - __main__ - Step: 1300 Loss: 4.6496 LR: 0.000060 +12/10/2025 07:56:25 - INFO - __main__ - Step: 1310 Loss: 4.8271 LR: 0.000060 +12/10/2025 07:56:42 - INFO - __main__ - Step: 1320 Loss: 1.8501 LR: 0.000060 +12/10/2025 07:56:59 - INFO - __main__ - Step: 1330 Loss: 6.8129 LR: 0.000060 +12/10/2025 07:57:15 - INFO - __main__ - Step: 1340 Loss: 3.6670 LR: 0.000060 +12/10/2025 07:57:32 - INFO - __main__ - Step: 1350 Loss: 4.4606 LR: 0.000060 +12/10/2025 07:57:48 - INFO - __main__ - Step: 1360 Loss: 3.2188 LR: 0.000060 +12/10/2025 07:58:05 - INFO - __main__ - Step: 1370 Loss: 6.0791 LR: 0.000060 +12/10/2025 07:58:22 - INFO - __main__ - Step: 1380 Loss: 3.4297 LR: 0.000060 +12/10/2025 07:58:38 - INFO - __main__ - Step: 1390 Loss: 3.6190 LR: 0.000060 +12/10/2025 07:58:55 - INFO - __main__ - Step: 1400 Loss: 4.6799 LR: 0.000060 +Traceback (most recent call last): + File 
"/mnt/Meissonic/train/train_mei_video.py", line 1774, in + if accelerator.is_main_process: + File "/mnt/Meissonic/train/train_mei_video.py", line 1525, in main + logits = model( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward + out_list = torch.utils.checkpoint.checkpoint( + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 489, in forward + x = cross_attn_ffn(x, context, context_lens, e) + File "/mnt/Meissonic/src/transformer_video.py", line 477, in cross_attn_ffn + x = x + self.cross_attn(self.norm3(x), context, context_lens) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 389, in forward + k = self.norm_k(self.k(context)).view(b, -1, n, d) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1774, in +[rank0]: if accelerator.is_main_process: +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1525, in main +[rank0]: logits = model( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward +[rank0]: return 
model_forward(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward +[rank0]: out_list = torch.utils.checkpoint.checkpoint( +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint +[rank0]: ret = function(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 489, in forward +[rank0]: x = cross_attn_ffn(x, context, context_lens, e) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 477, in cross_attn_ffn +[rank0]: x = x + self.cross_attn(self.norm3(x), context, context_lens) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 389, in forward +[rank0]: k = self.norm_k(self.k(context)).view(b, -1, n, d) +[rank0]: File 
"/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: KeyboardInterrupt diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/requirements.txt b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d01ecad871b6b3baba9900a3b3d370e9205a61d --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/requirements.txt @@ -0,0 +1,151 @@ +ImageIO==2.37.2 +typing-inspection==0.4.2 +av==16.0.1 +dill==0.4.0 +matplotlib==3.10.7 +xxhash==3.6.0 +tap==0.2 +mc_bin_client==1.0.1 +exceptiongroup==1.3.1 +cycler==0.12.1 +einops==0.8.1 +opencv-python==4.12.0.88 +scikit-image==0.25.2 +dashscope==1.25.2 +charset-normalizer==3.4.4 +filelock==3.19.1 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +kiwisolver==1.4.9 +Flask==3.1.2 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +setuptools==80.9.0 +websocket-client==1.9.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +itsdangerous==2.2.0 +pydantic_core==2.41.5 +matrix-game-2.0==0.0.1 +wsproto==1.3.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +nvidia-cusparselt-cu12==0.7.1 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +lightning==2.6.0 +Flask-SocketIO==5.5.1 +torchvision==0.24.1 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +fonttools==4.61.0 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +modelscope==1.32.0 +ftfy==6.3.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +beartype==0.22.8 +dominate==2.9.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +easydict==1.13 +networkx==3.3 +wheel==0.45.1 +timm==1.0.22 +pyparsing==3.2.5 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +pfzy==0.3.4 +httpcore==1.0.9 +multidict==6.7.0 +pycparser==2.23 +regex==2025.11.3 +importlib_metadata==8.7.0 +Werkzeug==3.1.4 +antlr4-python3-runtime==4.9.3 +sentry-sdk==2.46.0 +urllib3==2.5.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +torchmetrics==1.8.2 +cryptography==46.0.3 +omegaconf==2.3.0 +cffi==2.0.0 +packaging==25.0 +inquirerpy==0.3.4 +aiosignal==1.4.0 +MarkupSafe==2.1.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +decord==0.6.0 +async-timeout==5.0.1 +sympy==1.14.0 +numpy==2.1.2 +torch==2.9.1 +diffusers==0.35.2 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +tifffile==2025.5.10 +safetensors==0.7.0 +gitdb==4.0.12 +blinker==1.9.0 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +typer-slim==0.20.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +python-engineio==4.12.3 +lmdb==1.7.5 +nvidia-nvtx-cu12==12.8.90 +fsspec==2025.9.0 +markdown-it-py==4.0.0 +six==1.17.0 +platformdirs==4.5.0 +starlette==0.50.0 +scipy==1.15.3 +pycocotools==2.0.10 +accelerate==1.12.0 +zipp==3.23.0 +propcache==0.4.1 +bidict==0.23.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +simple-websocket==1.1.0 +nvidia-curand-cu12==10.3.9.90 +contourpy==1.3.2 +imageio-ffmpeg==0.6.0 
+nvidia-cufft-cu12==11.3.3.83 +pydantic==2.12.5 +pip==25.3 +prompt_toolkit==3.0.52 +pillow==11.3.0 +protobuf==6.33.1 +yarl==1.22.0 +clip==1.0 +nvidia-cudnn-cu12==9.10.2.21 +python-socketio==5.15.0 diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fe02fb11161661d5644632029f93d8d387062b74 --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.10.19", + "startedAt": "2025-12-10T07:17:16.641784Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/matrix-game2/bin/python3.10", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12144814755840" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + 
"cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "ydwpsmj3zkrh2el3ienk1qod2cym9vjb" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/wandb-summary.json b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b4d84d9a4ea753766a191277000a4b01d5a32bc6 --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2493},"_runtime":2493.834201701,"avg_masking_rate":0.6237305402755737,"_timestamp":1.7653535354021459e+09,"_step":1400,"step_loss":4.679940700531006,"lr":5.9999999999999995e-05,"generated_videos_first_frame":{"count":2,"filenames":["media/images/generated_videos_first_frame_1000_79ca9194981b4a266799.png","media/images/generated_videos_first_frame_1000_0dab8879463923119ab0.png"],"captions":["a cat playing","a girl walking"],"_type":"images/separated","width":128,"height":128,"format":"png"}} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug-core.log b/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..15ea64394d59c02984a80e416c94e13d83df2b2f --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug-core.log @@ -0,0 +1,12 @@ +{"time":"2025-12-10T07:17:16.714578164Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpomdelq4a/port-2831568.txt","pid":2831568,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T07:17:16.715052713Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2831568} +{"time":"2025-12-10T07:17:16.71505599Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2831568-2831806-3920032682/socket","Net":"unix"}} +{"time":"2025-12-10T07:17:16.901170368Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T07:17:16.90805965Z","level":"INFO","msg":"handleInformInit: received","streamId":"kc9aapl4","id":"1(@)"} +{"time":"2025-12-10T07:17:17.075066279Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"kc9aapl4","id":"1(@)"} +{"time":"2025-12-10T07:59:05.681805405Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T07:59:05.681949364Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T07:59:05.682036992Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-10T07:59:05.681970304Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T07:59:05.6821211Z","level":"INFO","msg":"server: listener 
closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-2831568-2831806-3920032682/socket","Net":"unix"}} +{"time":"2025-12-10T07:59:06.131796407Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug-internal.log b/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..876aa4da1059b7073ca50b93148f66c2853da60d --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-12-10T07:17:16.908162531Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T07:17:17.074836072Z","level":"INFO","msg":"stream: created new stream","id":"kc9aapl4"} +{"time":"2025-12-10T07:17:17.074944098Z","level":"INFO","msg":"handler: started","stream_id":"kc9aapl4"} +{"time":"2025-12-10T07:17:17.075059244Z","level":"INFO","msg":"stream: started","id":"kc9aapl4"} +{"time":"2025-12-10T07:17:17.075075183Z","level":"INFO","msg":"writer: started","stream_id":"kc9aapl4"} +{"time":"2025-12-10T07:17:17.075079961Z","level":"INFO","msg":"sender: started","stream_id":"kc9aapl4"} +{"time":"2025-12-10T07:59:05.681907531Z","level":"INFO","msg":"stream: closing","id":"kc9aapl4"} diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug.log b/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2329603fb33a68c67234d108c9a0f1b81b04aeb2 --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_setup.py:_flush():80] Configure stats pid to 2831568 +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug.log +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_071716-kc9aapl4/logs/debug-internal.log +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_init.py:init():841] calling init triggers +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 07:17:16,645 INFO MainThread:2831568 [wandb_init.py:init():889] starting backend +2025-12-10 07:17:16,901 INFO MainThread:2831568 [wandb_init.py:init():892] sending inform_init request +2025-12-10 07:17:16,906 INFO MainThread:2831568 [wandb_init.py:init():900] backend started and connected +2025-12-10 07:17:16,908 INFO MainThread:2831568 [wandb_init.py:init():970] updated telemetry +2025-12-10 07:17:16,912 INFO MainThread:2831568 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 07:17:31,847 INFO MainThread:2831568 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 07:17:31,970 INFO 
MainThread:2831568 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 07:17:31,970 INFO MainThread:2831568 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 07:17:31,970 INFO MainThread:2831568 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 07:17:31,970 INFO MainThread:2831568 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-10 07:17:31,973 INFO MainThread:2831568 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 07:17:31,974 INFO MainThread:2831568 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} +2025-12-10 07:59:05,682 INFO wandb-AsyncioManager-main:2831568 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 07:59:05,682 INFO wandb-AsyncioManager-main:2831568 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
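The config dump above records `learning_rate: 0.0003` together with `wan_backbone_lr_ratio: 0.2`, which is why the step logs report LR 0.000060 for the Wan backbone while the token_embedding / logits_head parts run at 0.000300. The sketch below illustrates one way to build such a two-group optimizer; the name-prefix split is an assumption, and plain AdamW stands in for the 8-bit Adam the run actually uses via `--use_8bit_adam`.

```python
# Illustrative two-LR parameter grouping (not the script's actual code).
import torch

def build_param_groups(model, base_lr=3e-4, backbone_ratio=0.2):
    backbone, other = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # Assumed split: backbone weights vs. token_embedding / logits_head.
        (backbone if name.startswith("backbone.") else other).append(param)
    return [
        {"params": backbone, "lr": base_lr * backbone_ratio},  # logged as 0.000060
        {"params": other, "lr": base_lr},                      # logged as 0.000300
    ]

# optimizer = torch.optim.AdamW(build_param_groups(model),
#                               betas=(0.9, 0.999), weight_decay=0.01)
```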
diff --git a/Meissonic/wandb/run-20251210_071716-kc9aapl4/run-kc9aapl4.wandb b/Meissonic/wandb/run-20251210_071716-kc9aapl4/run-kc9aapl4.wandb new file mode 100644 index 0000000000000000000000000000000000000000..7a59cbcddb54563819bf66448b3463483bc02221 --- /dev/null +++ b/Meissonic/wandb/run-20251210_071716-kc9aapl4/run-kc9aapl4.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e3e14587a993a57ae4b81ee401af9f90403ae0857025856ad896d75cda2d44c +size 720896 diff --git a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/config.yaml b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b4ef07bc459810d940e0adb64777dbee0f98e99 --- /dev/null +++ b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/config.yaml @@ -0,0 +1,305 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + 4fj7z9cxfwc9zu6b03d9bmfdsnai8a5p: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "8" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12199603720192" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/test/bin/python3.13 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: 
NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.13.10 + root: /mnt/Meissonic + startedAt: "2025-12-10T10:36:42.008952Z" + writerId: 4fj7z9cxfwc9zu6b03d9bmfdsnai8a5p + m: [] + python_version: 3.13.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + - 105 + "4": 3.13.10 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +empty_embeds_path: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 8 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/output.log b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/output.log new 
file mode 100644 index 0000000000000000000000000000000000000000..a937ee97e45916ab4ef0062b8e8e49acdfa71d2f --- /dev/null +++ b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/output.log @@ -0,0 +1,211 @@ +12/10/2025 10:36:42 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 10:36:42 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 10:36:42 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 10:36:42 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 10:36:42 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/10/2025 10:36:42 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 10:36:42 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 10:36:42 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 10:36:42 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 10:37:00 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 10:37:00 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 10:37:03 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 10:37:05 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 10:37:05 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 10:37:05 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 10:37:05 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 10:37:05 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 10:37:05 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 10:37:05 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 10:37:05 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 10:37:05 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 10:37:05 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 10:37:05 - INFO - __main__ - Dataloader configuration: +12/10/2025 10:37:05 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 10:37:05 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 10:37:05 - INFO - __main__ - - persistent_workers: True +12/10/2025 10:37:05 - INFO - __main__ - - pin_memory: True +12/10/2025 10:37:05 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 10:37:06 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 10:37:06 - INFO - __main__ - Loading empty_embeds from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +12/10/2025 10:37:06 - INFO - __main__ - Empty embeds info from metadata: shape=[1, 512, 4096] +12/10/2025 10:37:06 - INFO - __main__ - Loaded empty_embeds: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16 +12/10/2025 10:37:06 - INFO - __main__ - ***** Running training ***** +12/10/2025 10:37:06 - INFO - __main__ - Num training steps = 10000 +12/10/2025 
10:37:06 - INFO - __main__ - Instantaneous batch size per device = 8 +12/10/2025 10:37:06 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 256 +12/10/2025 10:37:06 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1786, in + main(parse_args()) + ~~~~^^^^^^^^^^^^^^ + File "/mnt/Meissonic/train/train_mei_video.py", line 1525, in main + logits = model( + tokens=input_ids, # [B, F', H', W'] + ...<2 lines>... + y=None, + ) # Returns [B, vocab_size, F', H', W'] + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward + else self._run_ddp_forward(*inputs, **kwargs) + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ + return super().__call__(*args, **kwargs) + ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/accelerate/utils/operations.py", line 819, in forward + return model_forward(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/accelerate/utils/operations.py", line 807, in __call__ + return convert_to_fp32(self.model_forward(*args, **kwargs)) + ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast + return func(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward + out_list = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.backbone), + ...<5 lines>... 
+ **ckpt_kwargs, + ) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/_compile.py", line 53, in inner + return disable_fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn + return fn(*args, **kwargs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint + ret = function(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward + return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward + x = block(x, **kwargs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward + y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/Meissonic/src/transformer_video.py", line 357, in forward + x = flash_attention( + q=rope_apply(q, grid_sizes, freqs), + ...<2 lines>... + k_lens=seq_lens, + window_size=self.window_size) + File "/mnt/Meissonic/src/transformer_video.py", line 124, in flash_attention + assert FLASH_ATTN_2_AVAILABLE + ^^^^^^^^^^^^^^^^^^^^^^ +AssertionError +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1786, in +[rank0]: main(parse_args()) +[rank0]: ~~~~^^^^^^^^^^^^^^ +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1525, in main +[rank0]: logits = model( +[rank0]: tokens=input_ids, # [B, F', H', W'] +[rank0]: ...<2 lines>... 
+[rank0]: y=None, +[rank0]: ) # Returns [B, vocab_size, F', H', W'] +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/parallel/distributed.py", line 1661, in forward +[rank0]: else self._run_ddp_forward(*inputs, **kwargs) +[rank0]: ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/parallel/distributed.py", line 1487, in _run_ddp_forward +[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index] +[rank0]: ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/_dynamo/eval_frame.py", line 414, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/accelerate/utils/operations.py", line 819, in forward +[rank0]: return model_forward(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/accelerate/utils/operations.py", line 807, in __call__ +[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs)) +[rank0]: ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast +[rank0]: return func(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/_dynamo/eval_frame.py", line 832, in compile_wrapper +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1016, in forward +[rank0]: out_list = torch.utils.checkpoint.checkpoint( +[rank0]: create_custom_forward(self.backbone), +[rank0]: ...<5 lines>... 
+[rank0]: **ckpt_kwargs, +[rank0]: ) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/_compile.py", line 53, in inner +[rank0]: return disable_fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn +[rank0]: return fn(*args, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/utils/checkpoint.py", line 503, in checkpoint +[rank0]: ret = function(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 1011, in custom_forward +[rank0]: return module(x=x_in, t=t_in, context=context_in, seq_len=seq_len_in, y=y_in) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 732, in forward +[rank0]: x = block(x, **kwargs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 471, in forward +[rank0]: y = self.self_attn(attn_input, seq_lens, grid_sizes, freqs) +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 357, in forward +[rank0]: x = flash_attention( +[rank0]: q=rope_apply(q, grid_sizes, freqs), +[rank0]: ...<2 lines>... 
+[rank0]: k_lens=seq_lens, +[rank0]: window_size=self.window_size) +[rank0]: File "/mnt/Meissonic/src/transformer_video.py", line 124, in flash_attention +[rank0]: assert FLASH_ATTN_2_AVAILABLE +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: AssertionError diff --git a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/requirements.txt b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d79dc78d38d4cd741be5be11dd4475290a154d2 --- /dev/null +++ b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/requirements.txt @@ -0,0 +1,121 @@ +typing-inspection==0.4.2 +dask==2025.11.0 +dill==0.4.0 +ffmpy==1.0.0 +xxhash==3.6.0 +partd==1.4.2 +brotli==1.2.0 +charset-normalizer==3.4.4 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +orjson==3.11.5 +pydantic_core==2.41.5 +groovy==0.1.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +MarkupSafe==3.0.3 +protobuf==6.33.2 +typer==0.20.0 +gradio==6.1.0 +sentry-sdk==2.47.0 +nvidia-cusparselt-cu12==0.7.1 +locket==1.0.0 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +pydub==0.25.1 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +torchvision==0.24.1 +fastapi==0.124.0 +cloudpickle==3.1.2 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +open_clip_torch==3.2.0 +mdurl==0.1.2 +pandas==2.3.3 +toolz==1.1.0 +ftfy==6.3.1 +platformdirs==4.5.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +wheel==0.45.1 +timm==1.0.22 +semantic-version==2.10.0 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +httpcore==1.0.9 +fsspec==2025.10.0 +multidict==6.7.0 +regex==2025.11.3 +gradio_client==2.0.1 +importlib_metadata==8.7.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +filelock==3.20.0 +torchmetrics==1.8.2 +numpy==2.3.5 +uvicorn==0.38.0 +packaging==25.0 +aiosignal==1.4.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +urllib3==2.6.1 +networkx==3.6.1 +setuptools==80.9.0 +sympy==1.14.0 +torch==2.9.1 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +safetensors==0.7.0 +gitdb==4.0.12 +safehttpx==0.1.7 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +pillow==12.0.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +nvidia-nvtx-cu12==12.8.90 +markdown-it-py==4.0.0 +six==1.17.0 +starlette==0.50.0 +audioop-lts==0.2.2 +accelerate==1.12.0 +diffusers==0.36.0 +annotated-doc==0.0.4 +zipp==3.23.0 +propcache==0.4.1 +mpmath==1.3.0 +nvidia-curand-cu12==10.3.9.90 +python-multipart==0.0.20 +nvidia-cufft-cu12==11.3.3.83 +pip==25.3 +aiofiles==24.1.0 +yarl==1.22.0 +nvidia-cudnn-cu12==9.10.2.21 +tomlkit==0.13.3 +pydantic==2.12.4 diff --git a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c7439b4cb5ac48814cb0810c311356e386d2aefa --- /dev/null +++ b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.13.10", + "startedAt": "2025-12-10T10:36:42.008952Z", + "args": [ + "--use_precomputed_features", + 
"--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/test/bin/python3.13", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12199603720192" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "4fj7z9cxfwc9zu6b03d9bmfdsnai8a5p" +} \ No newline at end of file diff --git 
a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/wandb-summary.json b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..aa34b295a76fbe8b47b19e3ac892049d460b7ba8 --- /dev/null +++ b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":68},"_runtime":68} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug-core.log b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..5780d21d630c6b99da01913e2ac2da61aa1666db --- /dev/null +++ b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-10T10:36:42.091100269Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpen4dnq09/port-1177490.txt","pid":1177490,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T10:36:42.091692568Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1177490} +{"time":"2025-12-10T10:36:42.091708517Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1177490-1177781-2755904903/socket","Net":"unix"}} +{"time":"2025-12-10T10:36:42.277704585Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T10:36:42.28458188Z","level":"INFO","msg":"handleInformInit: received","streamId":"tkz8w8yv","id":"1(@)"} +{"time":"2025-12-10T10:36:42.455192024Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"tkz8w8yv","id":"1(@)"} +{"time":"2025-12-10T10:37:51.456333068Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T10:37:51.456411118Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T10:37:51.456431784Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T10:37:51.456575184Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-10T10:37:51.458699605Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1177490-1177781-2755904903/socket","Net":"unix"}} +{"time":"2025-12-10T10:37:51.77414998Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-10T10:37:51.774173687Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-10T10:37:51.774184046Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug-internal.log b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..898c0bbd9bc23a4de40f4599f2a43b6d27ede1f7 --- /dev/null +++ b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-10T10:36:42.284695178Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T10:36:42.454961318Z","level":"INFO","msg":"stream: created new stream","id":"tkz8w8yv"} +{"time":"2025-12-10T10:36:42.455049148Z","level":"INFO","msg":"handler: started","stream_id":"tkz8w8yv"} +{"time":"2025-12-10T10:36:42.455185135Z","level":"INFO","msg":"stream: started","id":"tkz8w8yv"} 
+{"time":"2025-12-10T10:36:42.455198542Z","level":"INFO","msg":"writer: started","stream_id":"tkz8w8yv"} +{"time":"2025-12-10T10:36:42.45520445Z","level":"INFO","msg":"sender: started","stream_id":"tkz8w8yv"} +{"time":"2025-12-10T10:37:51.456438757Z","level":"INFO","msg":"stream: closing","id":"tkz8w8yv"} +{"time":"2025-12-10T10:37:51.672305417Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-10T10:37:51.770950948Z","level":"INFO","msg":"handler: closed","stream_id":"tkz8w8yv"} +{"time":"2025-12-10T10:37:51.77105342Z","level":"INFO","msg":"sender: closed","stream_id":"tkz8w8yv"} +{"time":"2025-12-10T10:37:51.771061354Z","level":"INFO","msg":"stream: closed","id":"tkz8w8yv"} diff --git a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug.log b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e6a588ee6aca52e9d4fecdab90bb641591b29821 --- /dev/null +++ b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_setup.py:_flush():80] Configure stats pid to 1177490 +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug.log +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_103642-tkz8w8yv/logs/debug-internal.log +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_init.py:init():841] calling init triggers +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 10:36:42,010 INFO MainThread:1177490 [wandb_init.py:init():889] starting backend +2025-12-10 10:36:42,278 INFO MainThread:1177490 [wandb_init.py:init():892] sending inform_init request +2025-12-10 10:36:42,283 INFO MainThread:1177490 [wandb_init.py:init():900] backend started and connected +2025-12-10 10:36:42,284 INFO MainThread:1177490 [wandb_init.py:init():970] updated telemetry +2025-12-10 10:36:42,289 INFO MainThread:1177490 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 10:36:42,673 INFO MainThread:1177490 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 10:36:42,760 INFO MainThread:1177490 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 10:36:42,760 INFO MainThread:1177490 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 10:36:42,760 INFO MainThread:1177490 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 10:36:42,760 INFO MainThread:1177490 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-10 10:36:42,763 INFO MainThread:1177490 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 10:36:42,764 INFO MainThread:1177490 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} +2025-12-10 10:37:51,456 INFO wandb-AsyncioManager-main:1177490 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 10:37:51,456 INFO wandb-AsyncioManager-main:1177490 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
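The tkz8w8yv run above aborts roughly a minute in at `assert FLASH_ATTN_2_AVAILABLE` in `src/transformer_video.py`, and the requirements.txt captured for that run lists no `flash_attn` package. The sketch below is illustrative only and is not the repository's actual attention code (it ignores the `k_lens`/`window_size` arguments the real module passes); it shows one common pattern for guarding the flash-attn path and falling back to PyTorch's built-in scaled dot-product attention instead of asserting:

```python
# Illustrative sketch only -- not the code in src/transformer_video.py.
# Guard the flash-attn path and fall back to PyTorch SDPA instead of
# asserting, for environments (like this run's) without flash_attn installed.
import torch
import torch.nn.functional as F

try:
    from flash_attn import flash_attn_func
    FLASH_ATTN_2_AVAILABLE = True
except ImportError:
    FLASH_ATTN_2_AVAILABLE = False

def attention(q, k, v, dropout_p=0.0, causal=False):
    """q, k, v: (batch, seq_len, num_heads, head_dim)."""
    if FLASH_ATTN_2_AVAILABLE and q.is_cuda and q.dtype in (torch.float16, torch.bfloat16):
        # flash-attn expects (B, L, H, D) half-precision CUDA tensors.
        return flash_attn_func(q, k, v, dropout_p=dropout_p, causal=causal)
    # SDPA expects (B, H, L, D); transpose in and out.
    out = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
        dropout_p=dropout_p, is_causal=causal,
    )
    return out.transpose(1, 2)
```

With a guard like this the run would degrade to SDPA rather than crash; the later im5q8jfr run instead resolves the failure by installing flash_attn==2.8.3, as its captured requirements.txt shows.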
diff --git a/Meissonic/wandb/run-20251210_103642-tkz8w8yv/run-tkz8w8yv.wandb b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/run-tkz8w8yv.wandb new file mode 100644 index 0000000000000000000000000000000000000000..7ad45294dfeddac82165f803b0b3e0fc19dd33b5 Binary files /dev/null and b/Meissonic/wandb/run-20251210_103642-tkz8w8yv/run-tkz8w8yv.wandb differ diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/config.yaml b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0bbd61f2d547d7828a6bade3e9d464eb29fd58d --- /dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/config.yaml @@ -0,0 +1,307 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + gsh7dojjv7efi33ri5grdscpp81d6a5z: + args: + - --use_precomputed_features + - --features_dir + - /mnt/VideoGen/dataset/OpenVid1M/extracted_features + - --text_encoder_architecture + - umt5-xxl + - --wan_pretrained_path + - Wan-AI/Wan2.1-T2V-1.3B + - --training_from_scratch + - "True" + - --pretrained_model_name_or_path + - dummy + - --wan_backbone_lr_ratio + - "0.2" + - --num_frames + - "17" + - --video_height + - "128" + - --video_width + - "128" + - --dataloader_num_workers + - "8" + - --video_tokenizer_model_id + - Cosmos-0.1-Tokenizer-DV4x8x8 + - --instance_dataset + - OpenVid1MDataset + - --instance_data_dir + - /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv + - --train_batch_size + - "8" + - --gradient_accumulation_steps + - "4" + - --learning_rate + - "3e-4" + - --max_train_steps + - "10000" + - --checkpointing_steps + - "500" + - --validation_steps + - "500" + - --logging_steps + - "10" + - --validation_prompts + - a cat playing + - a girl walking + - --output_dir + - ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio + - --mixed_precision + - bf16 + - --lr_scheduler + - constant + - --lr_warmup_steps + - "0" + - --use_8bit_adam + - --gradient_checkpointing + - --min_masking_rate + - "0.0" + - --cond_dropout_prob + - "0.1" + - --split_vae_encode + - "1" + - --allow_tf32 + - --seed + - "42" + - --report_to + - wandb + codePath: train/train_mei_video.py + codePathLocal: train/train_mei_video.py + cpu_count: 48 + cpu_count_logical: 96 + cudaVersion: "12.8" + disk: + /: + total: "16650112278528" + used: "12200011956224" + email: jinbin5bai@gmail.com + executable: /home/ubuntu/miniconda3/envs/test/bin/python3.13 + git: + commit: 6819d374ef1b86bdedad373aab1121a89687e5cf + remote: https://github.com/viiika/Meissonic.git + gpu: NVIDIA A100-SXM4-40GB + gpu_count: 8 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-71102f28-cd17-57e7-6181-120bf743d23d + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-303ab142-3206-9a14-c758-58ab97d7510e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: 
"42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "42949672960" + name: NVIDIA A100-SXM4-40GB + uuid: GPU-efb2d1fc-1eed-653d-ed51-5273085154ba + host: ip-172-31-91-136 + memory: + total: "1204521451520" + os: Linux-6.8.0-1027-aws-x86_64-with-glibc2.35 + program: /mnt/Meissonic/train/train_mei_video.py + python: CPython 3.13.10 + root: /mnt/Meissonic + startedAt: "2025-12-10T10:58:33.251591Z" + writerId: gsh7dojjv7efi33ri5grdscpp81d6a5z + m: [] + python_version: 3.13.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 83 + - 98 + - 105 + "3": + - 61 + "4": 3.13.10 + "5": 0.23.1 + "6": 4.57.3 + "12": 0.23.1 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0.01 +allow_tf32: + value: true +checkpointing_steps: + value: 500 +checkpoints_total_limit: + value: null +cond_dropout_prob: + value: 0.1 +dataloader_num_workers: + value: 8 +dataloader_prefetch_factor: + value: 2 +ema_decay: + value: 0.9999 +ema_update_after_step: + value: 0 +empty_embeds_path: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +features_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +freeze_wan_backbone: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: true +image_key: + value: null +instance_data_dataset: + value: null +instance_data_dir: + value: /mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv +instance_data_image: + value: null +instance_dataset: + value: OpenVid1MDataset +learning_rate: + value: 0.0003 +logging_dir: + value: logs +logging_steps: + value: 10 +lora_alpha: + value: 32 +lora_r: + value: 16 +lora_target_modules: + value: + - to_q + - to_k + - to_v +lr_scheduler: + value: constant +lr_warmup_steps: + value: 0 +max_grad_norm: + value: 50 +max_train_steps: + value: 10000 +min_masking_rate: + value: 0 +mixed_precision: + value: bf16 +num_frames: + value: 17 +output_dir: + value: ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +pretrained_model_name_or_path: + value: dummy +prompt_key: + value: null +prompt_prefix: + value: null +report_to: + value: wandb +resolution: + value: 512 +resume_from_checkpoint: + value: null +revision: + value: null +scale_lr: + value: false +seed: + value: 42 +split_vae_encode: + value: 1 +text_encoder_architecture: + value: umt5-xxl +text_encoder_lora_alpha: + value: 32 +text_encoder_lora_r: + value: 16 +text_encoder_lora_target_modules: + value: + - to_q + - to_k + - to_v +text_encoder_use_lora: + value: false +train_batch_size: + value: 8 +train_text_encoder: + value: false +training_from_scratch: + value: true +use_8bit_adam: + value: true +use_ema: + value: false +use_lora: + value: false +use_precomputed_features: + value: true +validation_prompts: + value: + - a cat playing + - a girl walking +validation_steps: + value: 500 +variant: + value: null +video_height: + value: 128 +video_tokenizer_model_id: + value: Cosmos-0.1-Tokenizer-DV4x8x8 +video_width: + value: 128 +wan_backbone_lr_ratio: + value: 0.2 +wan_pretrained_path: + value: Wan-AI/Wan2.1-T2V-1.3B diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/output.log b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..aef3cb03f1fa8ed45f682c752a8ddc79d657ef91 --- 
/dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/output.log @@ -0,0 +1,133 @@ +12/10/2025 10:58:34 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 10:58:34 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 10:58:34 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 10:58:34 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 10:58:34 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/10/2025 10:58:34 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 10:58:34 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 10:58:34 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 10:58:34 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 10:58:50 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 10:58:50 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 10:58:52 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 10:58:54 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 10:58:54 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 10:58:54 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 10:58:54 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 10:58:54 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 10:58:54 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 10:58:54 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 10:58:54 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 10:58:54 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 10:58:54 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 10:58:54 - INFO - __main__ - Dataloader configuration: +12/10/2025 10:58:54 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 10:58:54 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 10:58:54 - INFO - __main__ - - persistent_workers: True +12/10/2025 10:58:54 - INFO - __main__ - - pin_memory: True +12/10/2025 10:58:54 - INFO - __main__ - Preparing model, optimizer and dataloaders +12/10/2025 10:58:55 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 10:58:55 - INFO - __main__ - Loading empty_embeds from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +12/10/2025 10:58:55 - INFO - __main__ - Empty embeds info from metadata: shape=[1, 512, 4096] +12/10/2025 10:58:55 - INFO - __main__ - Loaded empty_embeds: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16 +12/10/2025 10:58:55 - INFO - __main__ - ***** Running training ***** +12/10/2025 10:58:55 - INFO - __main__ - Num training steps = 10000 +12/10/2025 10:58:55 - INFO - __main__ - Instantaneous batch size per device = 8 +12/10/2025 10:58:55 - INFO - __main__ - Total 
train batch size (w. parallel, distributed & accumulation) = 256 +12/10/2025 10:58:55 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/10/2025 10:59:21 - INFO - __main__ - Step: 10 Loss: 11.0753 LR: 0.000060 +12/10/2025 10:59:37 - INFO - __main__ - Step: 20 Loss: 11.0730 LR: 0.000060 +12/10/2025 10:59:52 - INFO - __main__ - Step: 30 Loss: 11.0695 LR: 0.000060 +12/10/2025 11:00:09 - INFO - __main__ - Step: 40 Loss: 11.0665 LR: 0.000060 +12/10/2025 11:00:27 - INFO - __main__ - Step: 50 Loss: 11.0638 LR: 0.000060 +12/10/2025 11:00:44 - INFO - __main__ - Step: 60 Loss: 11.0581 LR: 0.000060 +12/10/2025 11:01:03 - INFO - __main__ - Step: 70 Loss: 11.0504 LR: 0.000060 +12/10/2025 11:01:21 - INFO - __main__ - Step: 80 Loss: 11.0334 LR: 0.000060 +12/10/2025 11:01:39 - INFO - __main__ - Step: 90 Loss: 11.0099 LR: 0.000060 +12/10/2025 11:01:57 - INFO - __main__ - Step: 100 Loss: 10.9694 LR: 0.000060 +12/10/2025 11:02:15 - INFO - __main__ - Step: 110 Loss: 10.9228 LR: 0.000060 +12/10/2025 11:02:33 - INFO - __main__ - Step: 120 Loss: 10.8648 LR: 0.000060 +12/10/2025 11:02:51 - INFO - __main__ - Step: 130 Loss: 10.7970 LR: 0.000060 +12/10/2025 11:03:09 - INFO - __main__ - Step: 140 Loss: 10.7392 LR: 0.000060 +12/10/2025 11:03:25 - INFO - __main__ - Step: 150 Loss: 10.6722 LR: 0.000060 +12/10/2025 11:03:42 - INFO - __main__ - Step: 160 Loss: 10.6268 LR: 0.000060 +12/10/2025 11:04:00 - INFO - __main__ - Step: 170 Loss: 10.5703 LR: 0.000060 +12/10/2025 11:04:17 - INFO - __main__ - Step: 180 Loss: 10.5288 LR: 0.000060 +12/10/2025 11:04:33 - INFO - __main__ - Step: 190 Loss: 10.4711 LR: 0.000060 +12/10/2025 11:04:49 - INFO - __main__ - Step: 200 Loss: 10.4425 LR: 0.000060 +12/10/2025 11:05:05 - INFO - __main__ - Step: 210 Loss: 10.4134 LR: 0.000060 +12/10/2025 11:05:21 - INFO - __main__ - Step: 220 Loss: 10.3852 LR: 0.000060 +12/10/2025 11:05:37 - INFO - __main__ - Step: 230 Loss: 10.3618 LR: 0.000060 +12/10/2025 11:05:54 - INFO - __main__ - Step: 240 Loss: 10.3473 LR: 0.000060 +12/10/2025 11:06:09 - INFO - __main__ - Step: 250 Loss: 10.3280 LR: 0.000060 +12/10/2025 11:06:25 - INFO - __main__ - Step: 260 Loss: 10.3093 LR: 0.000060 +12/10/2025 11:06:41 - INFO - __main__ - Step: 270 Loss: 10.3049 LR: 0.000060 +12/10/2025 11:06:57 - INFO - __main__ - Step: 280 Loss: 10.3022 LR: 0.000060 +12/10/2025 11:07:13 - INFO - __main__ - Step: 290 Loss: 10.2769 LR: 0.000060 +12/10/2025 11:07:29 - INFO - __main__ - Step: 300 Loss: 10.2934 LR: 0.000060 +12/10/2025 11:07:46 - INFO - __main__ - Step: 310 Loss: 10.2800 LR: 0.000060 +12/10/2025 11:08:02 - INFO - __main__ - Step: 320 Loss: 10.2736 LR: 0.000060 +12/10/2025 11:08:20 - INFO - __main__ - Step: 330 Loss: 10.2749 LR: 0.000060 +12/10/2025 11:08:37 - INFO - __main__ - Step: 340 Loss: 10.2522 LR: 0.000060 +12/10/2025 11:08:56 - INFO - __main__ - Step: 350 Loss: 10.2730 LR: 0.000060 +12/10/2025 11:09:14 - INFO - __main__ - Step: 360 Loss: 10.2558 LR: 0.000060 +12/10/2025 11:09:32 - INFO - __main__ - Step: 370 Loss: 10.2199 LR: 0.000060 +12/10/2025 11:09:50 - INFO - __main__ - Step: 380 Loss: 10.2401 LR: 0.000060 +12/10/2025 11:10:06 - INFO - __main__ - Step: 390 Loss: 10.2107 LR: 0.000060 
+12/10/2025 11:10:22 - INFO - __main__ - Step: 400 Loss: 10.2311 LR: 0.000060 +12/10/2025 11:10:39 - INFO - __main__ - Step: 410 Loss: 10.2198 LR: 0.000060 +12/10/2025 11:10:56 - INFO - __main__ - Step: 420 Loss: 10.2267 LR: 0.000060 +12/10/2025 11:11:13 - INFO - __main__ - Step: 430 Loss: 10.2086 LR: 0.000060 +12/10/2025 11:11:29 - INFO - __main__ - Step: 440 Loss: 10.2145 LR: 0.000060 +12/10/2025 11:11:46 - INFO - __main__ - Step: 450 Loss: 10.2026 LR: 0.000060 +12/10/2025 11:12:01 - INFO - __main__ - Step: 460 Loss: 10.1881 LR: 0.000060 +12/10/2025 11:12:18 - INFO - __main__ - Step: 470 Loss: 10.1810 LR: 0.000060 +12/10/2025 11:12:34 - INFO - __main__ - Step: 480 Loss: 10.1763 LR: 0.000060 +12/10/2025 11:12:50 - INFO - __main__ - Step: 490 Loss: 10.1732 LR: 0.000060 +12/10/2025 11:13:07 - INFO - __main__ - Step: 500 Loss: 10.1615 LR: 0.000060 +12/10/2025 11:13:07 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 11:13:56 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/optimizer.bin +12/10/2025 11:13:56 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/scheduler.bin +12/10/2025 11:13:56 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/sampler.bin +12/10/2025 11:13:56 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl +12/10/2025 11:13:56 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 11:13:56 - INFO - __main__ - Generating videos for validation... +12/10/2025 11:13:56 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 17.24it/s] +Traceback (most recent call last): + File "/mnt/Meissonic/train/train_mei_video.py", line 1786, in + main(parse_args()) + ~~~~^^^^^^^^^^^^^^ + File "/mnt/Meissonic/train/train_mei_video.py", line 1624, in main + val_tokenizer = T5Tokenizer.from_pretrained(model_id) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/transformers/utils/import_utils.py", line 2157, in __getattribute__ + requires_backends(cls, cls._backends) + ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^ + File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/transformers/utils/import_utils.py", line 2143, in requires_backends + raise ImportError("".join(failed)) +ImportError: +T5Tokenizer requires the SentencePiece library but it was not found in your environment. Check out the instructions on the +installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones +that match your environment. Please note that you may need to restart your runtime after installation. 
+ +[rank0]: Traceback (most recent call last): +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1786, in +[rank0]: main(parse_args()) +[rank0]: ~~~~^^^^^^^^^^^^^^ +[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1624, in main +[rank0]: val_tokenizer = T5Tokenizer.from_pretrained(model_id) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/transformers/utils/import_utils.py", line 2157, in __getattribute__ +[rank0]: requires_backends(cls, cls._backends) +[rank0]: ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/home/ubuntu/miniconda3/envs/test/lib/python3.13/site-packages/transformers/utils/import_utils.py", line 2143, in requires_backends +[rank0]: raise ImportError("".join(failed)) +[rank0]: ImportError: +[rank0]: T5Tokenizer requires the SentencePiece library but it was not found in your environment. Check out the instructions on the +[rank0]: installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones +[rank0]: that match your environment. Please note that you may need to restart your runtime after installation. diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/requirements.txt b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5872439cda379024a8df7829cfc0a29905cc5d6a --- /dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/requirements.txt @@ -0,0 +1,123 @@ +typing-inspection==0.4.2 +dask==2025.11.0 +dill==0.4.0 +ffmpy==1.0.0 +xxhash==3.6.0 +partd==1.4.2 +brotli==1.2.0 +einops==0.8.1 +charset-normalizer==3.4.4 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +orjson==3.11.5 +pydantic_core==2.41.5 +groovy==0.1.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +MarkupSafe==3.0.3 +protobuf==6.33.2 +typer==0.20.0 +gradio==6.1.0 +sentry-sdk==2.47.0 +nvidia-cusparselt-cu12==0.7.1 +locket==1.0.0 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +pydub==0.25.1 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +torchvision==0.24.1 +fastapi==0.124.0 +cloudpickle==3.1.2 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +toolz==1.1.0 +ftfy==6.3.1 +platformdirs==4.5.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +wheel==0.45.1 +timm==1.0.22 +semantic-version==2.10.0 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +httpcore==1.0.9 +fsspec==2025.10.0 +multidict==6.7.0 +regex==2025.11.3 +gradio_client==2.0.1 +importlib_metadata==8.7.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +filelock==3.20.0 +torchmetrics==1.8.2 +numpy==2.3.5 +uvicorn==0.38.0 +packaging==25.0 +aiosignal==1.4.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +urllib3==2.6.1 +networkx==3.6.1 +setuptools==80.9.0 +sympy==1.14.0 +torch==2.9.1 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +safetensors==0.7.0 +gitdb==4.0.12 +safehttpx==0.1.7 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +pillow==12.0.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 
+nvidia-nvtx-cu12==12.8.90 +markdown-it-py==4.0.0 +six==1.17.0 +starlette==0.50.0 +audioop-lts==0.2.2 +accelerate==1.12.0 +diffusers==0.36.0 +annotated-doc==0.0.4 +zipp==3.23.0 +propcache==0.4.1 +mpmath==1.3.0 +nvidia-curand-cu12==10.3.9.90 +python-multipart==0.0.20 +nvidia-cufft-cu12==11.3.3.83 +pip==25.3 +aiofiles==24.1.0 +yarl==1.22.0 +nvidia-cudnn-cu12==9.10.2.21 +tomlkit==0.13.3 +pydantic==2.12.4 diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..05bca784de2bdbff051bcdb701e8c7c53cb3a717 --- /dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.13.10", + "startedAt": "2025-12-10T10:58:33.251591Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/test/bin/python3.13", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12200011956224" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": 
"42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "gsh7dojjv7efi33ri5grdscpp81d6a5z" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/wandb-summary.json b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..7e29653e8870158d3fbebd62907d6fadcde28217 --- /dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/files/wandb-summary.json @@ -0,0 +1 @@ +{"avg_masking_rate":0.6341594457626343,"_timestamp":1.7653651871397667e+09,"_step":500,"_wandb":{"runtime":963},"_runtime":963.417445979,"step_loss":10.161508560180664,"lr":5.9999999999999995e-05} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug-core.log b/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..535ffdcb05ce31f7579f901cf41a6b9d66c55e29 --- /dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-10T10:58:33.330145802Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp566f72w7/port-1225720.txt","pid":1225720,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T10:58:33.330717487Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1225720} +{"time":"2025-12-10T10:58:33.330730083Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1225720-1225978-1373833328/socket","Net":"unix"}} +{"time":"2025-12-10T10:58:33.516447235Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T10:58:33.522585352Z","level":"INFO","msg":"handleInformInit: received","streamId":"im5q8jfr","id":"1(@)"} +{"time":"2025-12-10T10:58:33.701451654Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"im5q8jfr","id":"1(@)"} +{"time":"2025-12-10T11:14:37.337228333Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-10T11:14:37.337300901Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-10T11:14:37.337290811Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-10T11:14:37.337379234Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1225720-1225978-1373833328/socket","Net":"unix"}} +{"time":"2025-12-10T11:14:37.337410474Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} 
+{"time":"2025-12-10T11:14:37.749424155Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-10T11:14:37.749441581Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-10T11:14:37.749450085Z","level":"INFO","msg":"server is closed"} diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug-internal.log b/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..6489bd693648981c5f33c8f3ca517cf12c0a3bf6 --- /dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-10T10:58:33.522737058Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T10:58:33.701175344Z","level":"INFO","msg":"stream: created new stream","id":"im5q8jfr"} +{"time":"2025-12-10T10:58:33.701318417Z","level":"INFO","msg":"handler: started","stream_id":"im5q8jfr"} +{"time":"2025-12-10T10:58:33.701444522Z","level":"INFO","msg":"stream: started","id":"im5q8jfr"} +{"time":"2025-12-10T10:58:33.701462771Z","level":"INFO","msg":"sender: started","stream_id":"im5q8jfr"} +{"time":"2025-12-10T10:58:33.701464614Z","level":"INFO","msg":"writer: started","stream_id":"im5q8jfr"} +{"time":"2025-12-10T11:14:37.337303763Z","level":"INFO","msg":"stream: closing","id":"im5q8jfr"} +{"time":"2025-12-10T11:14:37.579507859Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-10T11:14:37.746254099Z","level":"INFO","msg":"handler: closed","stream_id":"im5q8jfr"} +{"time":"2025-12-10T11:14:37.74632182Z","level":"INFO","msg":"sender: closed","stream_id":"im5q8jfr"} +{"time":"2025-12-10T11:14:37.746337064Z","level":"INFO","msg":"stream: closed","id":"im5q8jfr"} diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug.log b/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8e1a09b3b4abe19a93105eef6d134f3ba191fbcf --- /dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug.log @@ -0,0 +1,24 @@ +2025-12-10 10:58:33,252 INFO MainThread:1225720 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 10:58:33,252 INFO MainThread:1225720 [wandb_setup.py:_flush():80] Configure stats pid to 1225720 +2025-12-10 10:58:33,252 INFO MainThread:1225720 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 10:58:33,253 INFO MainThread:1225720 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 10:58:33,253 INFO MainThread:1225720 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 10:58:33,253 INFO MainThread:1225720 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug.log +2025-12-10 10:58:33,253 INFO MainThread:1225720 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_105833-im5q8jfr/logs/debug-internal.log +2025-12-10 10:58:33,253 INFO MainThread:1225720 [wandb_init.py:init():841] calling init triggers +2025-12-10 10:58:33,253 INFO MainThread:1225720 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 10:58:33,253 INFO MainThread:1225720 [wandb_init.py:init():889] starting backend +2025-12-10 10:58:33,516 INFO 
MainThread:1225720 [wandb_init.py:init():892] sending inform_init request +2025-12-10 10:58:33,521 INFO MainThread:1225720 [wandb_init.py:init():900] backend started and connected +2025-12-10 10:58:33,523 INFO MainThread:1225720 [wandb_init.py:init():970] updated telemetry +2025-12-10 10:58:33,527 INFO MainThread:1225720 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 10:58:33,919 INFO MainThread:1225720 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 10:58:34,008 INFO MainThread:1225720 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 10:58:34,008 INFO MainThread:1225720 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 10:58:34,008 INFO MainThread:1225720 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 10:58:34,008 INFO MainThread:1225720 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-10 10:58:34,010 INFO MainThread:1225720 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 10:58:34,011 INFO MainThread:1225720 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} +2025-12-10 11:14:37,337 INFO wandb-AsyncioManager-main:1225720 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-10 11:14:37,337 INFO wandb-AsyncioManager-main:1225720 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. 
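Run im5q8jfr trains for 500 steps and saves a checkpoint, then dies about sixteen minutes in when validation calls `T5Tokenizer.from_pretrained` without the sentencepiece backend installed (the requirements.txt captured for this run indeed lists no sentencepiece entry). A pre-flight check run before training starts would surface the missing dependency immediately instead of after the first checkpoint; the helper below is a hypothetical sketch, not part of train_mei_video.py:

```python
# Hypothetical pre-flight check (not in the repository): fail fast if a
# backend needed only at --validation_steps time is missing, rather than
# crashing after the first checkpoint at step 500.
import importlib.util

def assert_validation_backends(packages=("sentencepiece",)):
    missing = [p for p in packages if importlib.util.find_spec(p) is None]
    if missing:
        raise RuntimeError(
            "Validation requires missing packages: " + ", ".join(missing)
            + ". Install them (e.g. `pip install sentencepiece`) before launching."
        )

assert_validation_backends()
```

The third run (mrtah7xe) gets past the tokenizer load, so the package was presumably installed between the two runs.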
diff --git a/Meissonic/wandb/run-20251210_105833-im5q8jfr/run-im5q8jfr.wandb b/Meissonic/wandb/run-20251210_105833-im5q8jfr/run-im5q8jfr.wandb new file mode 100644 index 0000000000000000000000000000000000000000..42ca7803d926fd387492d033753dbad096cc8d68 --- /dev/null +++ b/Meissonic/wandb/run-20251210_105833-im5q8jfr/run-im5q8jfr.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b393857bb43e2b56cce1852678952a81bb1f035db15ae0d9adaaa790911193f5 +size 272716 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1000_56042e5c1e3ad351c124.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1000_56042e5c1e3ad351c124.png new file mode 100644 index 0000000000000000000000000000000000000000..765049cc3b4cfc63f63db750cd28fc118528068c Binary files /dev/null and b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1000_56042e5c1e3ad351c124.png differ diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1000_716aacc5bc223fd0d079.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1000_716aacc5bc223fd0d079.png new file mode 100644 index 0000000000000000000000000000000000000000..7d3ab8f7d36144d635bed78174c827c4afbaaba5 Binary files /dev/null and b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1000_716aacc5bc223fd0d079.png differ diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1500_b05dd99c5903cf5f83ff.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1500_b05dd99c5903cf5f83ff.png new file mode 100644 index 0000000000000000000000000000000000000000..87c6b04798ae294957f3fa0e5cae938c22d94fe6 Binary files /dev/null and b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1500_b05dd99c5903cf5f83ff.png differ diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1500_e65dea2500a19c25f564.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1500_e65dea2500a19c25f564.png new file mode 100644 index 0000000000000000000000000000000000000000..9ccd6d7be42a04df7bd83005a59f98b14d6c1659 Binary files /dev/null and b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_1500_e65dea2500a19c25f564.png differ diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_500_175da7a7f9f39cf2ca22.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_500_175da7a7f9f39cf2ca22.png new file mode 100644 index 0000000000000000000000000000000000000000..e7fe178ee86afd3497018541c3ac27baa8220c05 Binary files /dev/null and b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_500_175da7a7f9f39cf2ca22.png differ diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_500_f225a4471ecef1dbfaeb.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_500_f225a4471ecef1dbfaeb.png new file mode 100644 index 0000000000000000000000000000000000000000..760ada88e30cdf4dc4683a8e4fb7328a4d871a1d Binary files /dev/null and 
b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_first_frame_500_f225a4471ecef1dbfaeb.png differ diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_654e84862d8c0a13f1b5.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_654e84862d8c0a13f1b5.png new file mode 100644 index 0000000000000000000000000000000000000000..878e99d547a10280cfc35b864fda1c8b7542ffec --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_654e84862d8c0a13f1b5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654e84862d8c0a13f1b5c5321faf51f7e42bdea5337ed399b9d45addce85278a +size 449974 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_cd153009051a7d605018.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_cd153009051a7d605018.png new file mode 100644 index 0000000000000000000000000000000000000000..38f4e347989e21a17df32d0b41ad1253f05db150 --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_cd153009051a7d605018.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd153009051a7d6050189d64dffa50ee778bbb8be7b2d706c53a9bfe104bc445 +size 408663 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_56d7d215080b273e9155.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_56d7d215080b273e9155.png new file mode 100644 index 0000000000000000000000000000000000000000..4c7187945690a70279354741cc60575fe8e5d2ab --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_56d7d215080b273e9155.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56d7d215080b273e9155fabd655e8d067a165f6e2554fd135b5233869915f077 +size 452138 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_736c017ca88662cd1d11.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_736c017ca88662cd1d11.png new file mode 100644 index 0000000000000000000000000000000000000000..86f2afb4ae6521a3ef13f10a289e3a29c9d7260f --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_736c017ca88662cd1d11.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:736c017ca88662cd1d11e43ada8129c0fb46f23101f73c275e6bfae1f95d7b55 +size 452537 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_4f69c990d95a223b9d06.png b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_4f69c990d95a223b9d06.png new file mode 100644 index 0000000000000000000000000000000000000000..8396f8c64b07e2f15cad630a54e1f9bac46f8758 --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_4f69c990d95a223b9d06.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f69c990d95a223b9d06117c623b8a353a9415a910fddeaf43bddb5fd465c2b5 +size 330201 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_5fc1dbdaeeaef4847234.png 
b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_5fc1dbdaeeaef4847234.png new file mode 100644 index 0000000000000000000000000000000000000000..6522c3ba8b5dbd7111ac918ba956841f177a9b48 --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_5fc1dbdaeeaef4847234.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fc1dbdaeeaef4847234f332b9f24bad9d5540cd20f35cd09af9b6b8bd26e96a +size 323804 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/output.log b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..be36da2ad8e60df2d1a2791ff4c093acb0f8f3fc --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/output.log @@ -0,0 +1,252 @@ +12/10/2025 11:44:40 - INFO - __main__ - Using precomputed features - skipping text encoder and video tokenizer loading during training +12/10/2025 11:44:40 - INFO - __main__ - Text encoder and video tokenizer will be loaded only during validation/inference +12/10/2025 11:44:40 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000 +12/10/2025 11:44:40 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000 +12/10/2025 11:44:40 - INFO - __main__ - Getting compressed dimensions from precomputed features... +12/10/2025 11:44:40 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16 +12/10/2025 11:44:40 - INFO - __main__ - Got text_dim from metadata: 4096 +12/10/2025 11:44:40 - INFO - __main__ - Loading Wan config from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 11:44:40 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12 +12/10/2025 11:44:56 - INFO - __main__ - Loading Wan pretrained weights from: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 11:44:56 - INFO - __main__ - Loading weights from HuggingFace Hub: Wan-AI/Wan2.1-T2V-1.3B +12/10/2025 11:44:58 - INFO - __main__ - ✓ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding) +12/10/2025 11:44:59 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833 +12/10/2025 11:44:59 - INFO - __main__ - Wan backbone lr = 0.000060 (base_lr * 0.2) +12/10/2025 11:44:59 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.000300 +12/10/2025 11:44:59 - INFO - __main__ - Creating dataloaders and lr_scheduler +12/10/2025 11:44:59 - INFO - __main__ - Using pre-extracted features from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features +12/10/2025 11:44:59 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features/metadata.json +12/10/2025 11:44:59 - INFO - train.dataset_utils - Total samples in metadata: 1024 +12/10/2025 11:44:59 - INFO - train.dataset_utils - PrecomputedFeatureDataset: 128 samples available +12/10/2025 11:44:59 - INFO - train.dataset_utils - Index range: 0 to 127 +12/10/2025 11:44:59 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True +12/10/2025 11:44:59 - INFO - __main__ - Dataloader configuration: +12/10/2025 11:44:59 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video) +12/10/2025 11:44:59 - INFO - __main__ - - prefetch_factor: 2 +12/10/2025 11:44:59 - INFO - __main__ - - persistent_workers: True +12/10/2025 11:44:59 - INFO - __main__ - - pin_memory: True +12/10/2025 11:44:59 - INFO - 
__main__ - Preparing model, optimizer and dataloaders +12/10/2025 11:45:04 - INFO - __main__ - Skipping text_encoder.to() - using precomputed features +12/10/2025 11:45:04 - INFO - __main__ - Loading empty_embeds from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy +12/10/2025 11:45:04 - INFO - __main__ - Empty embeds info from metadata: shape=[1, 512, 4096] +12/10/2025 11:45:04 - INFO - __main__ - Loaded empty_embeds: shape=torch.Size([1, 512, 4096]), dtype=torch.bfloat16 +12/10/2025 11:45:04 - INFO - __main__ - ***** Running training ***** +12/10/2025 11:45:04 - INFO - __main__ - Num training steps = 10000 +12/10/2025 11:45:04 - INFO - __main__ - Instantaneous batch size per device = 8 +12/10/2025 11:45:04 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 256 +12/10/2025 11:45:04 - INFO - __main__ - Gradient Accumulation steps = 4 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +[DEBUG] video_tokens: shape=torch.Size([8, 5, 16, 16]), dtype=torch.int32, device=cuda:0 +[DEBUG] encoder_hidden_states: shape=torch.Size([8, 512, 4096]), dtype=torch.bfloat16, device=cuda:0 +12/10/2025 11:45:26 - INFO - __main__ - Step: 10 Loss: 11.0753 LR: 0.000060 +12/10/2025 11:45:44 - INFO - __main__ - Step: 20 Loss: 11.0730 LR: 0.000060 +12/10/2025 11:46:02 - INFO - __main__ - Step: 30 Loss: 11.0695 LR: 0.000060 +12/10/2025 11:46:20 - INFO - __main__ - Step: 40 Loss: 11.0665 LR: 0.000060 +12/10/2025 11:46:37 - INFO - __main__ - Step: 50 Loss: 11.0638 LR: 0.000060 +12/10/2025 11:46:54 - INFO - __main__ - Step: 60 Loss: 11.0581 LR: 0.000060 +12/10/2025 11:47:11 - INFO - __main__ - Step: 70 Loss: 11.0504 LR: 0.000060 +12/10/2025 11:47:28 - INFO - __main__ - Step: 80 Loss: 11.0334 LR: 0.000060 +12/10/2025 11:47:46 - INFO - __main__ - Step: 90 Loss: 11.0099 LR: 0.000060 +12/10/2025 11:48:03 - INFO - __main__ - Step: 100 Loss: 10.9694 LR: 0.000060 +12/10/2025 11:48:19 - INFO - __main__ - Step: 110 Loss: 10.9227 LR: 0.000060 +12/10/2025 11:48:35 - INFO - __main__ - Step: 120 Loss: 10.8646 LR: 0.000060 +12/10/2025 11:48:51 - INFO - __main__ - Step: 130 Loss: 10.7967 LR: 0.000060 +12/10/2025 11:49:07 - INFO - __main__ - Step: 140 Loss: 10.7391 LR: 0.000060 +12/10/2025 11:49:23 - INFO - __main__ - Step: 150 Loss: 10.6720 LR: 0.000060 +12/10/2025 11:49:39 - INFO - __main__ - Step: 160 Loss: 10.6266 LR: 0.000060 +12/10/2025 11:49:55 - INFO - __main__ - Step: 170 Loss: 10.5701 LR: 0.000060 +12/10/2025 11:50:12 - INFO - __main__ - Step: 180 Loss: 10.5286 LR: 0.000060 +12/10/2025 11:50:28 - INFO - __main__ - Step: 190 Loss: 10.4709 LR: 0.000060 +12/10/2025 11:50:44 - INFO - __main__ - Step: 200 Loss: 10.4424 LR: 0.000060 +12/10/2025 11:51:00 - INFO - __main__ - Step: 210 Loss: 10.4137 LR: 0.000060 +12/10/2025 11:51:15 - INFO - __main__ - Step: 220 Loss: 10.3855 LR: 0.000060 +12/10/2025 11:51:30 - INFO - __main__ - Step: 230 Loss: 10.3617 LR: 0.000060 +12/10/2025 11:51:47 - INFO - __main__ - Step: 240 Loss: 10.3473 LR: 0.000060 +12/10/2025 11:52:03 - INFO - __main__ - Step: 250 Loss: 10.3278 LR: 0.000060 +12/10/2025 11:52:18 - INFO - __main__ - Step: 260 Loss: 10.3091 LR: 0.000060 +12/10/2025 11:52:34 - INFO - __main__ - Step: 270 Loss: 10.3043 LR: 0.000060 +12/10/2025 11:52:49 - INFO - __main__ - Step: 280 Loss: 10.3025 LR: 0.000060 +12/10/2025 11:53:05 - INFO - __main__ - Step: 290 Loss: 10.2772 LR: 0.000060 +12/10/2025 
11:53:20 - INFO - __main__ - Step: 300 Loss: 10.2931 LR: 0.000060 +12/10/2025 11:53:35 - INFO - __main__ - Step: 310 Loss: 10.2803 LR: 0.000060 +12/10/2025 11:53:52 - INFO - __main__ - Step: 320 Loss: 10.2737 LR: 0.000060 +12/10/2025 11:54:08 - INFO - __main__ - Step: 330 Loss: 10.2750 LR: 0.000060 +12/10/2025 11:54:24 - INFO - __main__ - Step: 340 Loss: 10.2526 LR: 0.000060 +12/10/2025 11:54:41 - INFO - __main__ - Step: 350 Loss: 10.2730 LR: 0.000060 +12/10/2025 11:54:56 - INFO - __main__ - Step: 360 Loss: 10.2564 LR: 0.000060 +12/10/2025 11:55:11 - INFO - __main__ - Step: 370 Loss: 10.2203 LR: 0.000060 +12/10/2025 11:55:26 - INFO - __main__ - Step: 380 Loss: 10.2405 LR: 0.000060 +12/10/2025 11:55:41 - INFO - __main__ - Step: 390 Loss: 10.2115 LR: 0.000060 +12/10/2025 11:55:56 - INFO - __main__ - Step: 400 Loss: 10.2321 LR: 0.000060 +12/10/2025 11:56:12 - INFO - __main__ - Step: 410 Loss: 10.2205 LR: 0.000060 +12/10/2025 11:56:27 - INFO - __main__ - Step: 420 Loss: 10.2271 LR: 0.000060 +12/10/2025 11:56:42 - INFO - __main__ - Step: 430 Loss: 10.2097 LR: 0.000060 +12/10/2025 11:56:58 - INFO - __main__ - Step: 440 Loss: 10.2136 LR: 0.000060 +12/10/2025 11:57:14 - INFO - __main__ - Step: 450 Loss: 10.2033 LR: 0.000060 +12/10/2025 11:57:30 - INFO - __main__ - Step: 460 Loss: 10.1879 LR: 0.000060 +12/10/2025 11:57:46 - INFO - __main__ - Step: 470 Loss: 10.1792 LR: 0.000060 +12/10/2025 11:58:02 - INFO - __main__ - Step: 480 Loss: 10.1799 LR: 0.000060 +12/10/2025 11:58:18 - INFO - __main__ - Step: 490 Loss: 10.1722 LR: 0.000060 +12/10/2025 11:58:35 - INFO - __main__ - Step: 500 Loss: 10.1545 LR: 0.000060 +12/10/2025 11:58:35 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 11:59:26 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/optimizer.bin +12/10/2025 11:59:26 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/scheduler.bin +12/10/2025 11:59:26 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/sampler.bin +12/10/2025 11:59:26 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500/random_states_0.pkl +12/10/2025 11:59:26 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-500 +12/10/2025 11:59:26 - INFO - __main__ - Generating videos for validation... +12/10/2025 11:59:26 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 94.25it/s] +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3110.18it/s] +12/10/2025 11:59:43 - INFO - __main__ - Text encoder and video tokenizer loaded for validation +12/10/2025 11:59:43 - INFO - __main__ - Generating videos for validation... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.46it/s] +12/10/2025 11:59:50 - INFO - __main__ - Validation videos saved to ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +12/10/2025 11:59:50 - INFO - __main__ - Cleaned up validation models and freed GPU memory +12/10/2025 12:00:08 - INFO - __main__ - Step: 510 Loss: 10.1603 LR: 0.000060 +12/10/2025 12:00:24 - INFO - __main__ - Step: 520 Loss: 10.1230 LR: 0.000060 +12/10/2025 12:00:40 - INFO - __main__ - Step: 530 Loss: 10.1212 LR: 0.000060 +12/10/2025 12:00:55 - INFO - __main__ - Step: 540 Loss: 10.0672 LR: 0.000060 +12/10/2025 12:01:11 - INFO - __main__ - Step: 550 Loss: 10.0958 LR: 0.000060 +12/10/2025 12:01:27 - INFO - __main__ - Step: 560 Loss: 10.0563 LR: 0.000060 +12/10/2025 12:01:43 - INFO - __main__ - Step: 570 Loss: 10.0529 LR: 0.000060 +12/10/2025 12:02:00 - INFO - __main__ - Step: 580 Loss: 10.0877 LR: 0.000060 +12/10/2025 12:02:16 - INFO - __main__ - Step: 590 Loss: 10.0004 LR: 0.000060 +12/10/2025 12:02:32 - INFO - __main__ - Step: 600 Loss: 9.8712 LR: 0.000060 +12/10/2025 12:02:48 - INFO - __main__ - Step: 610 Loss: 10.0444 LR: 0.000060 +12/10/2025 12:03:04 - INFO - __main__ - Step: 620 Loss: 9.8617 LR: 0.000060 +12/10/2025 12:03:20 - INFO - __main__ - Step: 630 Loss: 9.7717 LR: 0.000060 +12/10/2025 12:03:38 - INFO - __main__ - Step: 640 Loss: 9.8441 LR: 0.000060 +12/10/2025 12:03:56 - INFO - __main__ - Step: 650 Loss: 9.7777 LR: 0.000060 +12/10/2025 12:04:14 - INFO - __main__ - Step: 660 Loss: 10.0624 LR: 0.000060 +12/10/2025 12:04:32 - INFO - __main__ - Step: 670 Loss: 9.7147 LR: 0.000060 +12/10/2025 12:04:48 - INFO - __main__ - Step: 680 Loss: 9.8170 LR: 0.000060 +12/10/2025 12:05:05 - INFO - __main__ - Step: 690 Loss: 9.7774 LR: 0.000060 +12/10/2025 12:05:21 - INFO - __main__ - Step: 700 Loss: 9.4032 LR: 0.000060 +12/10/2025 12:05:37 - INFO - __main__ - Step: 710 Loss: 9.3151 LR: 0.000060 +12/10/2025 12:05:53 - INFO - __main__ - Step: 720 Loss: 9.0900 LR: 0.000060 +12/10/2025 12:06:08 - INFO - __main__ - Step: 730 Loss: 9.7551 LR: 0.000060 +12/10/2025 12:06:24 - INFO - __main__ - Step: 740 Loss: 9.2468 LR: 0.000060 +12/10/2025 12:06:39 - INFO - __main__ - Step: 750 Loss: 8.9362 LR: 0.000060 +12/10/2025 12:06:56 - INFO - __main__ - Step: 760 Loss: 9.3001 LR: 0.000060 +12/10/2025 12:07:14 - INFO - __main__ - Step: 770 Loss: 8.9890 LR: 0.000060 +12/10/2025 12:07:31 - INFO - __main__ - Step: 780 Loss: 9.1591 LR: 0.000060 +12/10/2025 12:07:49 - INFO - __main__ - Step: 790 Loss: 8.8347 LR: 0.000060 +12/10/2025 12:08:07 - INFO - __main__ - Step: 800 Loss: 8.5979 LR: 0.000060 +12/10/2025 12:08:25 - INFO - __main__ - Step: 810 Loss: 7.7741 LR: 0.000060 +12/10/2025 12:08:43 - INFO - __main__ - Step: 820 Loss: 8.4691 LR: 0.000060 +12/10/2025 12:09:01 - INFO - __main__ - Step: 830 Loss: 8.2816 LR: 0.000060 +12/10/2025 12:09:18 - INFO - __main__ - Step: 840 Loss: 8.7354 LR: 0.000060 +12/10/2025 
12:09:36 - INFO - __main__ - Step: 850 Loss: 7.8491 LR: 0.000060 +12/10/2025 12:09:53 - INFO - __main__ - Step: 860 Loss: 9.0077 LR: 0.000060 +12/10/2025 12:10:11 - INFO - __main__ - Step: 870 Loss: 8.2152 LR: 0.000060 +12/10/2025 12:10:28 - INFO - __main__ - Step: 880 Loss: 8.5758 LR: 0.000060 +12/10/2025 12:10:44 - INFO - __main__ - Step: 890 Loss: 8.3967 LR: 0.000060 +12/10/2025 12:11:00 - INFO - __main__ - Step: 900 Loss: 7.8143 LR: 0.000060 +12/10/2025 12:11:16 - INFO - __main__ - Step: 910 Loss: 8.2686 LR: 0.000060 +12/10/2025 12:11:32 - INFO - __main__ - Step: 920 Loss: 7.6982 LR: 0.000060 +12/10/2025 12:11:47 - INFO - __main__ - Step: 930 Loss: 7.9535 LR: 0.000060 +12/10/2025 12:12:03 - INFO - __main__ - Step: 940 Loss: 7.0900 LR: 0.000060 +12/10/2025 12:12:19 - INFO - __main__ - Step: 950 Loss: 7.5838 LR: 0.000060 +12/10/2025 12:12:35 - INFO - __main__ - Step: 960 Loss: 7.6097 LR: 0.000060 +12/10/2025 12:12:50 - INFO - __main__ - Step: 970 Loss: 4.9570 LR: 0.000060 +12/10/2025 12:13:06 - INFO - __main__ - Step: 980 Loss: 7.8306 LR: 0.000060 +12/10/2025 12:13:24 - INFO - __main__ - Step: 990 Loss: 7.1266 LR: 0.000060 +12/10/2025 12:13:41 - INFO - __main__ - Step: 1000 Loss: 6.5655 LR: 0.000060 +12/10/2025 12:13:41 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000 +12/10/2025 12:14:30 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/optimizer.bin +12/10/2025 12:14:30 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/scheduler.bin +12/10/2025 12:14:30 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/sampler.bin +12/10/2025 12:14:30 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000/random_states_0.pkl +12/10/2025 12:14:30 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1000 +12/10/2025 12:14:30 - INFO - __main__ - Generating videos for validation... +12/10/2025 12:14:30 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 97.14it/s] +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 6364.65it/s] +12/10/2025 12:14:46 - INFO - __main__ - Text encoder and video tokenizer loaded for validation +12/10/2025 12:14:46 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.36it/s] +12/10/2025 12:14:52 - INFO - __main__ - Validation videos saved to ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +12/10/2025 12:14:52 - INFO - __main__ - Cleaned up validation models and freed GPU memory +12/10/2025 12:15:08 - INFO - __main__ - Step: 1010 Loss: 6.2143 LR: 0.000060 +12/10/2025 12:15:25 - INFO - __main__ - Step: 1020 Loss: 6.1127 LR: 0.000060 +12/10/2025 12:15:43 - INFO - __main__ - Step: 1030 Loss: 4.2783 LR: 0.000060 +12/10/2025 12:16:00 - INFO - __main__ - Step: 1040 Loss: 6.1687 LR: 0.000060 +12/10/2025 12:16:18 - INFO - __main__ - Step: 1050 Loss: 7.1046 LR: 0.000060 +12/10/2025 12:16:35 - INFO - __main__ - Step: 1060 Loss: 6.0546 LR: 0.000060 +12/10/2025 12:16:50 - INFO - __main__ - Step: 1070 Loss: 6.2354 LR: 0.000060 +12/10/2025 12:17:06 - INFO - __main__ - Step: 1080 Loss: 4.4416 LR: 0.000060 +12/10/2025 12:17:22 - INFO - __main__ - Step: 1090 Loss: 5.3321 LR: 0.000060 +12/10/2025 12:17:39 - INFO - __main__ - Step: 1100 Loss: 5.4109 LR: 0.000060 +12/10/2025 12:17:56 - INFO - __main__ - Step: 1110 Loss: 5.9100 LR: 0.000060 +12/10/2025 12:18:11 - INFO - __main__ - Step: 1120 Loss: 5.4545 LR: 0.000060 +12/10/2025 12:18:28 - INFO - __main__ - Step: 1130 Loss: 4.8019 LR: 0.000060 +12/10/2025 12:18:43 - INFO - __main__ - Step: 1140 Loss: 4.0381 LR: 0.000060 +12/10/2025 12:18:59 - INFO - __main__ - Step: 1150 Loss: 5.6605 LR: 0.000060 +12/10/2025 12:19:15 - INFO - __main__ - Step: 1160 Loss: 4.4688 LR: 0.000060 +12/10/2025 12:19:30 - INFO - __main__ - Step: 1170 Loss: 4.5869 LR: 0.000060 +12/10/2025 12:19:46 - INFO - __main__ - Step: 1180 Loss: 4.6429 LR: 0.000060 +12/10/2025 12:20:01 - INFO - __main__ - Step: 1190 Loss: 6.6926 LR: 0.000060 +12/10/2025 12:20:16 - INFO - __main__ - Step: 1200 Loss: 2.8797 LR: 0.000060 +12/10/2025 12:20:32 - INFO - __main__ - Step: 1210 Loss: 6.2622 LR: 0.000060 +12/10/2025 12:20:47 - INFO - __main__ - Step: 1220 Loss: 3.0145 LR: 0.000060 +12/10/2025 12:21:03 - INFO - __main__ - Step: 1230 Loss: 3.7550 LR: 0.000060 +12/10/2025 12:21:18 - INFO - __main__ - Step: 1240 Loss: 3.6606 LR: 0.000060 +12/10/2025 12:21:34 - INFO - __main__ - Step: 1250 Loss: 2.9642 LR: 0.000060 +12/10/2025 12:21:50 - INFO - __main__ - Step: 1260 Loss: 3.9318 LR: 0.000060 +12/10/2025 12:22:06 - INFO - __main__ - Step: 1270 Loss: 2.6559 LR: 0.000060 +12/10/2025 12:22:23 - INFO - __main__ - Step: 1280 Loss: 5.4312 LR: 0.000060 +12/10/2025 12:22:39 - INFO - __main__ - Step: 1290 Loss: 3.5359 LR: 0.000060 +12/10/2025 12:22:56 - INFO - __main__ - Step: 1300 Loss: 4.5748 LR: 0.000060 +12/10/2025 12:23:13 - INFO - __main__ - Step: 1310 Loss: 4.7353 LR: 0.000060 +12/10/2025 12:23:29 - INFO - __main__ - Step: 1320 Loss: 2.0193 LR: 0.000060 +12/10/2025 12:23:45 - INFO - __main__ - Step: 1330 Loss: 7.2324 LR: 0.000060 +12/10/2025 12:24:01 - INFO - __main__ - Step: 1340 Loss: 3.9969 LR: 0.000060 +12/10/2025 12:24:17 - INFO - __main__ - Step: 1350 Loss: 4.6693 LR: 0.000060 +12/10/2025 12:24:33 - INFO - __main__ - Step: 1360 Loss: 3.2349 LR: 0.000060 +12/10/2025 12:24:49 - INFO - __main__ - Step: 1370 Loss: 5.9346 LR: 0.000060 +12/10/2025 12:25:05 - INFO - __main__ - Step: 1380 Loss: 3.4152 LR: 0.000060 +12/10/2025 12:25:22 - INFO - __main__ - Step: 1390 Loss: 3.6506 LR: 0.000060 +12/10/2025 12:25:38 - INFO - __main__ - Step: 1400 Loss: 4.7416 LR: 0.000060 +12/10/2025 12:25:53 - INFO - __main__ - Step: 1410 
Loss: 1.4305 LR: 0.000060 +12/10/2025 12:26:09 - INFO - __main__ - Step: 1420 Loss: 6.6906 LR: 0.000060 +12/10/2025 12:26:26 - INFO - __main__ - Step: 1430 Loss: 5.7471 LR: 0.000060 +12/10/2025 12:26:42 - INFO - __main__ - Step: 1440 Loss: 3.6354 LR: 0.000060 +12/10/2025 12:26:58 - INFO - __main__ - Step: 1450 Loss: 3.8425 LR: 0.000060 +12/10/2025 12:27:13 - INFO - __main__ - Step: 1460 Loss: 2.7156 LR: 0.000060 +12/10/2025 12:27:29 - INFO - __main__ - Step: 1470 Loss: 2.9668 LR: 0.000060 +12/10/2025 12:27:44 - INFO - __main__ - Step: 1480 Loss: 4.7851 LR: 0.000060 +12/10/2025 12:27:59 - INFO - __main__ - Step: 1490 Loss: 4.0824 LR: 0.000060 +12/10/2025 12:28:15 - INFO - __main__ - Step: 1500 Loss: 7.5225 LR: 0.000060 +12/10/2025 12:28:15 - INFO - accelerate.accelerator - Saving current state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500 +12/10/2025 12:28:22 - INFO - accelerate.checkpointing - Optimizer state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/optimizer.bin +12/10/2025 12:28:22 - INFO - accelerate.checkpointing - Scheduler state saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/scheduler.bin +12/10/2025 12:28:22 - INFO - accelerate.checkpointing - Sampler state for dataloader 0 saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/sampler.bin +12/10/2025 12:28:22 - INFO - accelerate.checkpointing - Random states saved in output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500/random_states_0.pkl +12/10/2025 12:28:22 - INFO - __main__ - Saved state to output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio/checkpoint-1500 +12/10/2025 12:28:22 - INFO - __main__ - Generating videos for validation... +12/10/2025 12:28:22 - INFO - __main__ - Loading text encoder and video tokenizer for validation... +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 97.64it/s] +Fetching 7 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3384.45it/s] +12/10/2025 12:28:38 - INFO - __main__ - Text encoder and video tokenizer loaded for validation +12/10/2025 12:28:38 - INFO - __main__ - Generating videos for validation... 
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:05<00:00, 8.16it/s] +12/10/2025 12:28:45 - INFO - __main__ - Validation videos saved to ./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio +12/10/2025 12:28:45 - INFO - __main__ - Cleaned up validation models and freed GPU memory +12/10/2025 12:29:01 - INFO - __main__ - Step: 1510 Loss: 3.9462 LR: 0.000060 +12/10/2025 12:29:16 - INFO - __main__ - Step: 1520 Loss: 3.0234 LR: 0.000060 +12/10/2025 12:29:32 - INFO - __main__ - Step: 1530 Loss: 3.3499 LR: 0.000060 +12/10/2025 12:29:48 - INFO - __main__ - Step: 1540 Loss: 6.9508 LR: 0.000060 +12/10/2025 12:30:05 - INFO - __main__ - Step: 1550 Loss: 3.1735 LR: 0.000060 +12/10/2025 12:30:22 - INFO - __main__ - Step: 1560 Loss: 3.1230 LR: 0.000060 +12/10/2025 12:30:37 - INFO - __main__ - Step: 1570 Loss: 2.6245 LR: 0.000060 +12/10/2025 12:30:53 - INFO - __main__ - Step: 1580 Loss: 4.4095 LR: 0.000060 +12/10/2025 12:31:08 - INFO - __main__ - Step: 1590 Loss: 3.9888 LR: 0.000060 +12/10/2025 12:31:23 - INFO - __main__ - Step: 1600 Loss: 2.9593 LR: 0.000060 +12/10/2025 12:31:38 - INFO - __main__ - Step: 1610 Loss: 4.4038 LR: 0.000060 +12/10/2025 12:31:54 - INFO - __main__ - Step: 1620 Loss: 4.3135 LR: 0.000060 +12/10/2025 12:32:11 - INFO - __main__ - Step: 1630 Loss: 3.0804 LR: 0.000060 +12/10/2025 12:32:27 - INFO - __main__ - Step: 1640 Loss: 1.9102 LR: 0.000060 +12/10/2025 12:32:44 - INFO - __main__ - Step: 1650 Loss: 2.6062 LR: 0.000060 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/requirements.txt b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf356027904d253cf8d99c729116710476866e7d --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/requirements.txt @@ -0,0 +1,124 @@ +typing-inspection==0.4.2 +dask==2025.11.0 +dill==0.4.0 +ffmpy==1.0.0 +xxhash==3.6.0 +partd==1.4.2 +brotli==1.2.0 +einops==0.8.1 +charset-normalizer==3.4.4 +tokenizers==0.22.1 +aiohappyeyeballs==2.6.1 +python-dateutil==2.9.0.post0 +pyarrow==22.0.0 +annotated-types==0.7.0 +GitPython==3.1.45 +rich==14.2.0 +nvidia-cufile-cu12==1.13.1.3 +nvidia-nvshmem-cu12==3.3.20 +orjson==3.11.5 +pydantic_core==2.41.5 +groovy==0.1.2 +psutil==7.1.3 +peft==0.18.0 +typing_extensions==4.15.0 +wcwidth==0.2.14 +MarkupSafe==3.0.3 +protobuf==6.33.2 +typer==0.20.0 +gradio==6.1.0 +sentry-sdk==2.47.0 +nvidia-cusparselt-cu12==0.7.1 +locket==1.0.0 +PyYAML==6.0.3 +nvidia-nvjitlink-cu12==12.8.93 +datasets==4.4.1 +pytorch-lightning==2.6.0 +frozenlist==1.8.0 +pydub==0.25.1 +huggingface-hub==0.36.0 +Pygments==2.19.2 +aiohttp==3.13.2 +torchvision==0.24.1 +fastapi==0.124.0 +cloudpickle==3.1.2 +wandb==0.23.1 +tqdm==4.67.1 +httpx==0.28.1 +open_clip_torch==3.2.0 +flash_attn==2.8.3 +mdurl==0.1.2 +pandas==2.3.3 +toolz==1.1.0 +ftfy==6.3.1 +platformdirs==4.5.1 +transformers==4.57.3 +requests==2.32.5 +pytz==2025.2 +Jinja2==3.1.6 +click==8.3.1 +attrs==25.4.0 +hf-xet==1.2.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-runtime-cu12==12.8.90 +bitsandbytes==0.48.2 +lightning-utilities==0.15.2 +wheel==0.45.1 +timm==1.0.22 +semantic-version==2.10.0 +triton==3.5.1 +nvidia-cublas-cu12==12.8.4.1 +httpcore==1.0.9 +fsspec==2025.10.0 +multidict==6.7.0 +regex==2025.11.3 +gradio_client==2.0.1 +importlib_metadata==8.7.0 +anyio==4.12.0 +nvidia-cusolver-cu12==11.7.3.90 +filelock==3.20.0 +torchmetrics==1.8.2 +numpy==2.3.5 +uvicorn==0.38.0 +packaging==25.0 
+aiosignal==1.4.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +tzdata==2025.2 +urllib3==2.6.1 +networkx==3.6.1 +setuptools==80.9.0 +sympy==1.14.0 +torch==2.9.1 +nvidia-cuda-cupti-cu12==12.8.90 +smmap==5.0.2 +safetensors==0.7.0 +gitdb==4.0.12 +safehttpx==0.1.7 +nvidia-cusparse-cu12==12.5.8.93 +multiprocess==0.70.18 +pillow==12.0.0 +h11==0.16.0 +certifi==2025.11.12 +idna==3.11 +nvidia-nvtx-cu12==12.8.90 +markdown-it-py==4.0.0 +six==1.17.0 +starlette==0.50.0 +audioop-lts==0.2.2 +accelerate==1.12.0 +diffusers==0.36.0 +annotated-doc==0.0.4 +zipp==3.23.0 +propcache==0.4.1 +mpmath==1.3.0 +sentencepiece==0.2.1 +nvidia-curand-cu12==10.3.9.90 +python-multipart==0.0.20 +nvidia-cufft-cu12==11.3.3.83 +pip==25.3 +aiofiles==24.1.0 +yarl==1.22.0 +nvidia-cudnn-cu12==9.10.2.21 +tomlkit==0.13.3 +pydantic==2.12.4 diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/wandb-metadata.json b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..35053deba575cc499331b8e547a9c6bde5ed5bde --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/files/wandb-metadata.json @@ -0,0 +1,156 @@ +{ + "os": "Linux-6.8.0-1027-aws-x86_64-with-glibc2.35", + "python": "CPython 3.13.10", + "startedAt": "2025-12-10T11:44:39.581540Z", + "args": [ + "--use_precomputed_features", + "--features_dir", + "/mnt/VideoGen/dataset/OpenVid1M/extracted_features", + "--text_encoder_architecture", + "umt5-xxl", + "--wan_pretrained_path", + "Wan-AI/Wan2.1-T2V-1.3B", + "--training_from_scratch", + "True", + "--pretrained_model_name_or_path", + "dummy", + "--wan_backbone_lr_ratio", + "0.2", + "--num_frames", + "17", + "--video_height", + "128", + "--video_width", + "128", + "--dataloader_num_workers", + "8", + "--video_tokenizer_model_id", + "Cosmos-0.1-Tokenizer-DV4x8x8", + "--instance_dataset", + "OpenVid1MDataset", + "--instance_data_dir", + "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv", + "--train_batch_size", + "8", + "--gradient_accumulation_steps", + "4", + "--learning_rate", + "3e-4", + "--max_train_steps", + "10000", + "--checkpointing_steps", + "500", + "--validation_steps", + "500", + "--logging_steps", + "10", + "--validation_prompts", + "a cat playing", + "a girl walking", + "--output_dir", + "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio", + "--mixed_precision", + "bf16", + "--lr_scheduler", + "constant", + "--lr_warmup_steps", + "0", + "--use_8bit_adam", + "--gradient_checkpointing", + "--min_masking_rate", + "0.0", + "--cond_dropout_prob", + "0.1", + "--split_vae_encode", + "1", + "--allow_tf32", + "--seed", + "42", + "--report_to", + "wandb" + ], + "program": "/mnt/Meissonic/train/train_mei_video.py", + "codePath": "train/train_mei_video.py", + "codePathLocal": "train/train_mei_video.py", + "git": { + "remote": "https://github.com/viiika/Meissonic.git", + "commit": "6819d374ef1b86bdedad373aab1121a89687e5cf" + }, + "email": "jinbin5bai@gmail.com", + "root": "/mnt/Meissonic", + "host": "ip-172-31-91-136", + "executable": "/home/ubuntu/miniconda3/envs/test/bin/python3.13", + "cpu_count": 48, + "cpu_count_logical": 96, + "gpu": "NVIDIA A100-SXM4-40GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "16650112278528", + "used": "12200016330752" + } + }, + "memory": { + "total": "1204521451520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-54a50f05-7a41-8b8e-59c5-e1774ec42215" + }, + { + "name": "NVIDIA 
A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-71102f28-cd17-57e7-6181-120bf743d23d" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-57dfac44-bb50-f9b6-1534-27fbe79dfd87" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-f55652c0-bdaf-e7bb-a876-8fce14c3f879" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-303ab142-3206-9a14-c758-58ab97d7510e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1adf5c34-24d0-c5e2-b33b-783100bbd6c3" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-5b4a0e50-96a5-74bd-f595-14de5614cc6e" + }, + { + "name": "NVIDIA A100-SXM4-40GB", + "memoryTotal": "42949672960", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-efb2d1fc-1eed-653d-ed51-5273085154ba" + } + ], + "cudaVersion": "12.8", + "writerId": "e7k4synno5s6446cv7tglm0fogbbud44" +} \ No newline at end of file diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug-core.log b/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..1a9cdf779b238f7720ec7dac5b49a19502c81560 --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-12-10T11:44:39.658001682Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp_j5j81dh/port-1312594.txt","pid":1312594,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-10T11:44:39.658502262Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":1312594} +{"time":"2025-12-10T11:44:39.658508217Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-1312594-1312849-1745419067/socket","Net":"unix"}} +{"time":"2025-12-10T11:44:39.84508562Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-10T11:44:39.851289376Z","level":"INFO","msg":"handleInformInit: received","streamId":"mrtah7xe","id":"1(@)"} +{"time":"2025-12-10T11:44:40.016807865Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"mrtah7xe","id":"1(@)"} +{"time":"2025-12-10T12:33:00.214923867Z","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug-internal.log b/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d8dbd8f1d87e2a1194f2fe70f37102cb3ea4a565 --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2025-12-10T11:44:39.85137573Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-10T11:44:40.016600149Z","level":"INFO","msg":"stream: created new stream","id":"mrtah7xe"} +{"time":"2025-12-10T11:44:40.016677758Z","level":"INFO","msg":"handler: started","stream_id":"mrtah7xe"} 
+{"time":"2025-12-10T11:44:40.016799939Z","level":"INFO","msg":"stream: started","id":"mrtah7xe"} +{"time":"2025-12-10T11:44:40.016877013Z","level":"INFO","msg":"writer: started","stream_id":"mrtah7xe"} +{"time":"2025-12-10T11:44:40.016880098Z","level":"INFO","msg":"sender: started","stream_id":"mrtah7xe"} diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug.log b/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2c28c8a071776362325b31a769670028f231f277 --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug.log @@ -0,0 +1,22 @@ +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Configure stats pid to 1312594 +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings +2025-12-10 11:44:39,582 INFO MainThread:1312594 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug.log +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251210_114439-mrtah7xe/logs/debug-internal.log +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:init():841] calling init triggers +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-12-10 11:44:39,583 INFO MainThread:1312594 [wandb_init.py:init():889] starting backend +2025-12-10 11:44:39,845 INFO MainThread:1312594 [wandb_init.py:init():892] sending inform_init request +2025-12-10 11:44:39,849 INFO MainThread:1312594 [wandb_init.py:init():900] backend started and connected +2025-12-10 11:44:39,851 INFO MainThread:1312594 [wandb_init.py:init():970] updated telemetry +2025-12-10 11:44:39,855 INFO MainThread:1312594 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-10 11:44:40,285 INFO MainThread:1312594 [wandb_init.py:init():1041] starting run threads in backend +2025-12-10 11:44:40,377 INFO MainThread:1312594 [wandb_run.py:_console_start():2521] atexit reg +2025-12-10 11:44:40,377 INFO MainThread:1312594 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-10 11:44:40,377 INFO MainThread:1312594 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-10 11:44:40,377 INFO MainThread:1312594 [wandb_run.py:_redirect():2461] Redirects installed. 
+2025-12-10 11:44:40,380 INFO MainThread:1312594 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-10 11:44:40,380 INFO MainThread:1312594 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 10000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'learning_rate': 0.0003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 500, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking'], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.1, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': 'Wan-AI/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features', 'empty_embeds_path': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features/empty_embeds.npy'} diff --git a/Meissonic/wandb/run-20251210_114439-mrtah7xe/run-mrtah7xe.wandb b/Meissonic/wandb/run-20251210_114439-mrtah7xe/run-mrtah7xe.wandb new file mode 100644 index 0000000000000000000000000000000000000000..23b1a1e532997f949d7db7da4dc6651dd4315210 --- /dev/null +++ b/Meissonic/wandb/run-20251210_114439-mrtah7xe/run-mrtah7xe.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df857026859ba42c655fcb92a570c4f9ef3fc41929d3a3a1eeaf86de7eb8256f +size 851968
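The output.log added in this diff records the loss curve as plain "Step / Loss / LR" lines. As a purely illustrative sketch (not part of the training code in this diff), the snippet below shows one way to pull those values back out of the log for offline inspection; the log path is the one introduced above, and the helper name `parse_training_log` is hypothetical.

```python
# Hypothetical helper: extract "Step: N Loss: X LR: Y" records from an
# output.log like the one this diff adds, so the loss curve can be inspected
# without the W&B UI. Adjust LOG_PATH for other runs.
import re
from pathlib import Path

LOG_PATH = Path("Meissonic/wandb/run-20251210_114439-mrtah7xe/files/output.log")

# Matches e.g. "12/10/2025 11:45:26 - INFO - __main__ - Step: 10 Loss: 11.0753 LR: 0.000060"
STEP_RE = re.compile(r"Step:\s*(\d+)\s+Loss:\s*([\d.]+)\s+LR:\s*([\d.eE+-]+)")

def parse_training_log(path: Path) -> list[tuple[int, float, float]]:
    """Return (step, loss, lr) tuples in the order they appear in the log."""
    records = []
    for line in path.read_text().splitlines():
        m = STEP_RE.search(line)
        if m:
            records.append((int(m.group(1)), float(m.group(2)), float(m.group(3))))
    return records

if __name__ == "__main__":
    rows = parse_training_log(LOG_PATH)
    for step, loss, lr in rows[:5]:
        print(f"step={step:5d}  loss={loss:8.4f}  lr={lr:.6f}")
    # For this run the logged loss starts around 11.08 at step 10 and is
    # roughly in the 2-7 range by step ~1650.
```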