BryanW commited on
Commit
3d1c0e1
·
verified ·
1 Parent(s): cad164d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +275 -0
  2. Meissonic/.github/FUNDING.yml +15 -0
  3. Meissonic/.gitignore +166 -0
  4. Meissonic/InfinityStar/.gitignore +59 -0
  5. Meissonic/InfinityStar/LICENSE +21 -0
  6. Meissonic/InfinityStar/README.md +187 -0
  7. Meissonic/InfinityStar/__pycache__/train.cpython-310.pyc +0 -0
  8. Meissonic/InfinityStar/assets/Infinitystar_image_gen_benchmark.png +3 -0
  9. Meissonic/InfinityStar/assets/Infinitystar_videogen_benchmark.png +3 -0
  10. Meissonic/InfinityStar/assets/Infinitystar_videogen_humaneval.png +3 -0
  11. Meissonic/InfinityStar/assets/framework.png +3 -0
  12. Meissonic/InfinityStar/assets/i2v_examples.png +3 -0
  13. Meissonic/InfinityStar/assets/logo.png +3 -0
  14. Meissonic/InfinityStar/assets/reference_image.webp +3 -0
  15. Meissonic/InfinityStar/assets/supp_show_images.png +3 -0
  16. Meissonic/InfinityStar/assets/v2v_examples.png +3 -0
  17. Meissonic/InfinityStar/cog.yaml +46 -0
  18. Meissonic/InfinityStar/data/README.md +57 -0
  19. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0001_0010_000000100.jsonl +0 -0
  20. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0002_0010_000000100.jsonl +0 -0
  21. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0003_0010_000000100.jsonl +0 -0
  22. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0004_0010_000000100.jsonl +0 -0
  23. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0005_0010_000000100.jsonl +0 -0
  24. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0006_0010_000000100.jsonl +0 -0
  25. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0007_0010_000000100.jsonl +0 -0
  26. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0008_0010_000000100.jsonl +0 -0
  27. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0009_0010_000000100.jsonl +0 -0
  28. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0010_0010_000000100.jsonl +0 -0
  29. Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls_for_training.py +118 -0
  30. Meissonic/InfinityStar/data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4 +3 -0
  31. Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/0000_refine_720p.mp4 +3 -0
  32. Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/prompt.txt +4 -0
  33. Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/0000_refine_720p.mp4 +3 -0
  34. Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/prompt.txt +4 -0
  35. Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/0000_refine_720p.mp4 +3 -0
  36. Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/prompt.txt +4 -0
  37. Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/0000_refine_720p.mp4 +3 -0
  38. Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/prompt.txt +4 -0
  39. Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/0000_refine_720p.mp4 +3 -0
  40. Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/prompt.txt +4 -0
  41. Meissonic/InfinityStar/evaluation/README.md +2 -0
  42. Meissonic/InfinityStar/evaluation/VBench_rewrited_prompt.json +0 -0
  43. Meissonic/InfinityStar/infinity/__init__.py +2 -0
  44. Meissonic/InfinityStar/infinity/__pycache__/__init__.cpython-310.pyc +0 -0
  45. Meissonic/InfinityStar/infinity/dataset/__pycache__/build.cpython-310.pyc +0 -0
  46. Meissonic/InfinityStar/infinity/dataset/__pycache__/dataset_joint_vi.cpython-310.pyc +0 -0
  47. Meissonic/InfinityStar/infinity/dataset/build.py +218 -0
  48. Meissonic/InfinityStar/infinity/dataset/dataset_joint_vi.py +689 -0
  49. Meissonic/InfinityStar/infinity/models/__init__.py +29 -0
  50. Meissonic/InfinityStar/infinity/models/__pycache__/__init__.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -34,3 +34,278 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  OpenVid1M/video_reorg/OpenVid1M_reorganized.csv filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  OpenVid1M/video_reorg/OpenVid1M_reorganized.csv filter=lfs diff=lfs merge=lfs -text
37
+ Meissonic/InfinityStar/assets/Infinitystar_image_gen_benchmark.png filter=lfs diff=lfs merge=lfs -text
38
+ Meissonic/InfinityStar/assets/Infinitystar_videogen_benchmark.png filter=lfs diff=lfs merge=lfs -text
39
+ Meissonic/InfinityStar/assets/Infinitystar_videogen_humaneval.png filter=lfs diff=lfs merge=lfs -text
40
+ Meissonic/InfinityStar/assets/framework.png filter=lfs diff=lfs merge=lfs -text
41
+ Meissonic/InfinityStar/assets/i2v_examples.png filter=lfs diff=lfs merge=lfs -text
42
+ Meissonic/InfinityStar/assets/logo.png filter=lfs diff=lfs merge=lfs -text
43
+ Meissonic/InfinityStar/assets/reference_image.webp filter=lfs diff=lfs merge=lfs -text
44
+ Meissonic/InfinityStar/assets/supp_show_images.png filter=lfs diff=lfs merge=lfs -text
45
+ Meissonic/InfinityStar/assets/v2v_examples.png filter=lfs diff=lfs merge=lfs -text
46
+ Meissonic/InfinityStar/data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4 filter=lfs diff=lfs merge=lfs -text
47
+ Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text
48
+ Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text
49
+ Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text
50
+ Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text
51
+ Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/0000_refine_720p.mp4 filter=lfs diff=lfs merge=lfs -text
52
+ Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_0.png filter=lfs diff=lfs merge=lfs -text
53
+ Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_1.png filter=lfs diff=lfs merge=lfs -text
54
+ Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_2.png filter=lfs diff=lfs merge=lfs -text
55
+ Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_grid_video_3.png filter=lfs diff=lfs merge=lfs -text
56
+ Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_0.mp4 filter=lfs diff=lfs merge=lfs -text
57
+ Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_1.mp4 filter=lfs diff=lfs merge=lfs -text
58
+ Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_2.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ Meissonic/InfinityStar/infinity_vqvae_test_output/comparison_video_3.mp4 filter=lfs diff=lfs merge=lfs -text
60
+ Meissonic/InfinityStar/vae_reconstruction_test/comparison.mp4 filter=lfs diff=lfs merge=lfs -text
61
+ Meissonic/InfinityStar/vae_reconstruction_test/comparison_grid.png filter=lfs diff=lfs merge=lfs -text
62
+ Meissonic/InfinityStar/vae_reconstruction_test/frame_000_comparison.png filter=lfs diff=lfs merge=lfs -text
63
+ Meissonic/InfinityStar/vae_reconstruction_test/frame_001_comparison.png filter=lfs diff=lfs merge=lfs -text
64
+ Meissonic/InfinityStar/vae_reconstruction_test/frame_002_comparison.png filter=lfs diff=lfs merge=lfs -text
65
+ Meissonic/InfinityStar/vae_reconstruction_test/frame_003_comparison.png filter=lfs diff=lfs merge=lfs -text
66
+ Meissonic/InfinityStar/vae_reconstruction_test/frame_004_comparison.png filter=lfs diff=lfs merge=lfs -text
67
+ Meissonic/VidTok/assets/example.mp4 filter=lfs diff=lfs merge=lfs -text
68
+ Meissonic/VidTok/assets/radar.png filter=lfs diff=lfs merge=lfs -text
69
+ Meissonic/VidTok/assets/vidtwin.png filter=lfs diff=lfs merge=lfs -text
70
+ Meissonic/VidTok/assets/vidtwin_demo.png filter=lfs diff=lfs merge=lfs -text
71
+ Meissonic/VidTok/vidtok_cache/VidTok/assets/example.mp4 filter=lfs diff=lfs merge=lfs -text
72
+ Meissonic/VidTok/vidtok_cache/VidTok/assets/radar.png filter=lfs diff=lfs merge=lfs -text
73
+ Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin.png filter=lfs diff=lfs merge=lfs -text
74
+ Meissonic/VidTok/vidtok_cache/VidTok/assets/vidtwin_demo.png filter=lfs diff=lfs merge=lfs -text
75
+ Meissonic/VidTok/vidtok_test_output/comparison_grid_video_0.png filter=lfs diff=lfs merge=lfs -text
76
+ Meissonic/VidTok/vidtok_test_output/comparison_grid_video_1.png filter=lfs diff=lfs merge=lfs -text
77
+ Meissonic/VidTok/vidtok_test_output/comparison_grid_video_2.png filter=lfs diff=lfs merge=lfs -text
78
+ Meissonic/VidTok/vidtok_test_output/comparison_grid_video_3.png filter=lfs diff=lfs merge=lfs -text
79
+ Meissonic/VidTok/vidtok_test_output/comparison_video_0.mp4 filter=lfs diff=lfs merge=lfs -text
80
+ Meissonic/VidTok/vidtok_test_output/comparison_video_1.mp4 filter=lfs diff=lfs merge=lfs -text
81
+ Meissonic/VidTok/vidtok_test_output/comparison_video_2.mp4 filter=lfs diff=lfs merge=lfs -text
82
+ Meissonic/VidTok/vidtok_test_output/comparison_video_3.mp4 filter=lfs diff=lfs merge=lfs -text
83
+ Meissonic/assets/architecture.png filter=lfs diff=lfs merge=lfs -text
84
+ Meissonic/assets/demos.pdf filter=lfs diff=lfs merge=lfs -text
85
+ Meissonic/assets/demos.png filter=lfs diff=lfs merge=lfs -text
86
+ Meissonic/assets/inpaint/0eKR4M2uuL8.jpg filter=lfs diff=lfs merge=lfs -text
87
+ Meissonic/assets/inpaint/__Owak0IgJk.jpg filter=lfs diff=lfs merge=lfs -text
88
+ Meissonic/assets/outpaint/__G2yFuW7jQ.jpg filter=lfs diff=lfs merge=lfs -text
89
+ Meissonic/cosmos_test_output/comparison_grid_video_0.png filter=lfs diff=lfs merge=lfs -text
90
+ Meissonic/cosmos_test_output/comparison_grid_video_1.png filter=lfs diff=lfs merge=lfs -text
91
+ Meissonic/cosmos_test_output/comparison_grid_video_2.png filter=lfs diff=lfs merge=lfs -text
92
+ Meissonic/cosmos_test_output/comparison_grid_video_3.png filter=lfs diff=lfs merge=lfs -text
93
+ Meissonic/cosmos_test_output/comparison_video_1.mp4 filter=lfs diff=lfs merge=lfs -text
94
+ Meissonic/cosmos_test_output/comparison_video_2.mp4 filter=lfs diff=lfs merge=lfs -text
95
+ Meissonic/cosmos_test_output/comparison_video_3.mp4 filter=lfs diff=lfs merge=lfs -text
96
+ Meissonic/output/9_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
97
+ Meissonic/output/9_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
98
+ Meissonic/output/A[[:space:]]black[[:space:]]an_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
99
+ Meissonic/output/A[[:space:]]cat[[:space:]]wear_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
100
+ Meissonic/output/A[[:space:]]dog[[:space:]]in[[:space:]]a_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
101
+ Meissonic/output/A[[:space:]]large[[:space:]]bo_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
102
+ Meissonic/output/A[[:space:]]robot[[:space:]]pl_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
103
+ Meissonic/output/A[[:space:]]white[[:space:]]an_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
104
+ Meissonic/output/The[[:space:]]sun[[:space:]]is_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
105
+ Meissonic/output/Three[[:space:]]boat_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
106
+ Meissonic/output/Two[[:space:]]actors_1024_64_9_fp8.png filter=lfs diff=lfs merge=lfs -text
107
+ Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
108
+ Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/1499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
109
+ Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
110
+ Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
111
+ Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
112
+ Meissonic/output_128x128_17f_2\*4bs_4\*8\*8vqvae_0_2_ratio/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
113
+ Meissonic/output_180x320_16f_2bs_4\*8\*8vqvae_0_2_ratio/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
114
+ Meissonic/output_180x320_16f_2bs_4\*8\*8vqvae_0_2_ratio/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
115
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
116
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/1499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
117
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/1999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
118
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/1999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
119
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/2499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
120
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/2499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
121
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/2999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
122
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/2999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
123
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/3499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
124
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/3499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
125
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/3999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
126
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/3999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
127
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
128
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
129
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
130
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
131
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
132
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
133
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
134
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
135
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
136
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
137
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/1499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
138
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/1999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
139
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/1999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
140
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/2499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
141
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/2499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
142
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/2999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
143
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/2999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
144
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/3499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
145
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/3499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
146
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
147
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
148
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
149
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_00_ratio_continue_tmp/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
150
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/1499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
151
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/1499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
152
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/1999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
153
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/1999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
154
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/2499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
155
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/2499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
156
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/2999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
157
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/2999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
158
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/3499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
159
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/3499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
160
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/3999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
161
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/3999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
162
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/4499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
163
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/4499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
164
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/4999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
165
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/4999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
166
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
167
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
168
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/5499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
169
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/5499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
170
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/5999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
171
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/5999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
172
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/6499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
173
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/6499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
174
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/6999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
175
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/6999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
176
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/7499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
177
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/7499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
178
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/7999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
179
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/7999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
180
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/8499_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
181
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/8999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
182
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/8999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
183
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/9499_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
184
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/9999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
185
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/9999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
186
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/999_video_0_CFG-9.png filter=lfs diff=lfs merge=lfs -text
187
+ Meissonic/output_256x448_4f_2bs_4\*8\*8vqvae_0_10_ratio/999_video_1_CFG-9.png filter=lfs diff=lfs merge=lfs -text
188
+ Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/autoencoder.jit filter=lfs diff=lfs merge=lfs -text
189
+ Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/decoder.jit filter=lfs diff=lfs merge=lfs -text
190
+ Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV4x8x8/encoder.jit filter=lfs diff=lfs merge=lfs -text
191
+ Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/autoencoder.jit filter=lfs diff=lfs merge=lfs -text
192
+ Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/decoder.jit filter=lfs diff=lfs merge=lfs -text
193
+ Meissonic/pretrained_ckpts/Cosmos-0.1-Tokenizer-DV8x8x8/encoder.jit filter=lfs diff=lfs merge=lfs -text
194
+ Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/autoencoder.jit filter=lfs diff=lfs merge=lfs -text
195
+ Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/decoder.jit filter=lfs diff=lfs merge=lfs -text
196
+ Meissonic/pretrained_ckpts/Cosmos-1.0-Tokenizer-DV8x16x16/encoder.jit filter=lfs diff=lfs merge=lfs -text
197
+ Meissonic/vidtok_cache/VidTok/assets/example.mp4 filter=lfs diff=lfs merge=lfs -text
198
+ Meissonic/vidtok_cache/VidTok/assets/radar.png filter=lfs diff=lfs merge=lfs -text
199
+ Meissonic/vidtok_cache/VidTok/assets/vidtwin.png filter=lfs diff=lfs merge=lfs -text
200
+ Meissonic/vidtok_cache/VidTok/assets/vidtwin_demo.png filter=lfs diff=lfs merge=lfs -text
201
+ Meissonic/wandb/run-20251207_092554-l16v7o9l/run-l16v7o9l.wandb filter=lfs diff=lfs merge=lfs -text
202
+ Meissonic/wandb/run-20251207_094329-qf4q6gjw/run-qf4q6gjw.wandb filter=lfs diff=lfs merge=lfs -text
203
+ Meissonic/wandb/run-20251207_094715-uvgb9hvt/run-uvgb9hvt.wandb filter=lfs diff=lfs merge=lfs -text
204
+ Meissonic/wandb/run-20251207_102454-nnww5mz8/run-nnww5mz8.wandb filter=lfs diff=lfs merge=lfs -text
205
+ Meissonic/wandb/run-20251207_111518-slrbepi0/run-slrbepi0.wandb filter=lfs diff=lfs merge=lfs -text
206
+ Meissonic/wandb/run-20251207_113103-ijl2gw6b/run-ijl2gw6b.wandb filter=lfs diff=lfs merge=lfs -text
207
+ Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_2f39bee6c4969d94f6d2.png filter=lfs diff=lfs merge=lfs -text
208
+ Meissonic/wandb/run-20251207_113607-aryc95f2/files/media/images/generated_videos_first_frame_10_a0ddb52b457bceac4774.png filter=lfs diff=lfs merge=lfs -text
209
+ Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_1fc345a8cdc18e62468b.png filter=lfs diff=lfs merge=lfs -text
210
+ Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_1000_f4b36308698e96e11163.png filter=lfs diff=lfs merge=lfs -text
211
+ Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_0798147230daa742b054.png filter=lfs diff=lfs merge=lfs -text
212
+ Meissonic/wandb/run-20251207_114426-5sh31nrg/files/media/images/generated_videos_first_frame_500_aed08910c4a8dcdc87f6.png filter=lfs diff=lfs merge=lfs -text
213
+ Meissonic/wandb/run-20251207_114426-5sh31nrg/run-5sh31nrg.wandb filter=lfs diff=lfs merge=lfs -text
214
+ Meissonic/wandb/run-20251207_162442-54o4hegd/run-54o4hegd.wandb filter=lfs diff=lfs merge=lfs -text
215
+ Meissonic/wandb/run-20251208_032955-tl61pd0t/run-tl61pd0t.wandb filter=lfs diff=lfs merge=lfs -text
216
+ Meissonic/wandb/run-20251208_040606-2dcjc9k8/run-2dcjc9k8.wandb filter=lfs diff=lfs merge=lfs -text
217
+ Meissonic/wandb/run-20251208_062741-qalkbn80/run-qalkbn80.wandb filter=lfs diff=lfs merge=lfs -text
218
+ Meissonic/wandb/run-20251208_071823-0hjx73rw/run-0hjx73rw.wandb filter=lfs diff=lfs merge=lfs -text
219
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_8328d2d0556a95ff2759.png filter=lfs diff=lfs merge=lfs -text
220
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_10000_980ee3261a5cf9cce942.png filter=lfs diff=lfs merge=lfs -text
221
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_8fd26361f0705a90a632.png filter=lfs diff=lfs merge=lfs -text
222
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1000_cec203cb5c36d2873217.png filter=lfs diff=lfs merge=lfs -text
223
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_c061c65a6ce343b1660e.png filter=lfs diff=lfs merge=lfs -text
224
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_1500_f047fb97b642dc30b33c.png filter=lfs diff=lfs merge=lfs -text
225
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_2805ac51dfa6ef4de083.png filter=lfs diff=lfs merge=lfs -text
226
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2000_e98ce360ce92d75f9a36.png filter=lfs diff=lfs merge=lfs -text
227
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_430592107b01c838d952.png filter=lfs diff=lfs merge=lfs -text
228
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_2500_ecca4db815beca263f13.png filter=lfs diff=lfs merge=lfs -text
229
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_52422bc6ab7caedd5b8c.png filter=lfs diff=lfs merge=lfs -text
230
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3000_8e35b9b7d6b7a0806553.png filter=lfs diff=lfs merge=lfs -text
231
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_227067a6cd64b7cdced4.png filter=lfs diff=lfs merge=lfs -text
232
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_3500_55ba9221da0bf3c49190.png filter=lfs diff=lfs merge=lfs -text
233
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_9a50d3903fd31767c616.png filter=lfs diff=lfs merge=lfs -text
234
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4000_ffacfcca81b53cb27319.png filter=lfs diff=lfs merge=lfs -text
235
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_935711ba29b3ab613691.png filter=lfs diff=lfs merge=lfs -text
236
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_4500_bf885e1339d92cc386d1.png filter=lfs diff=lfs merge=lfs -text
237
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_bdd3a8c8c0c8a7a7d4dd.png filter=lfs diff=lfs merge=lfs -text
238
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5000_c8333d970fbc70e45c64.png filter=lfs diff=lfs merge=lfs -text
239
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_3d483725c07baf8663d3.png filter=lfs diff=lfs merge=lfs -text
240
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_500_b0f06ea56e9a9c08850c.png filter=lfs diff=lfs merge=lfs -text
241
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_60d433cf43a3cb8d1412.png filter=lfs diff=lfs merge=lfs -text
242
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_5500_7cd8c962e4d1b79b5dcc.png filter=lfs diff=lfs merge=lfs -text
243
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_41402987f48490139945.png filter=lfs diff=lfs merge=lfs -text
244
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6000_c6c41d57fcadc12fd69b.png filter=lfs diff=lfs merge=lfs -text
245
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_2d21e8a2ea1688bffb9d.png filter=lfs diff=lfs merge=lfs -text
246
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_6500_a609810c96cec2279a46.png filter=lfs diff=lfs merge=lfs -text
247
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_4a1fe2fe98784f7b8841.png filter=lfs diff=lfs merge=lfs -text
248
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7000_6119b9f39242430c319b.png filter=lfs diff=lfs merge=lfs -text
249
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_7e0ee18074e9b8d85c45.png filter=lfs diff=lfs merge=lfs -text
250
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_7500_b01a808f5a897296f898.png filter=lfs diff=lfs merge=lfs -text
251
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_15555bb3e2ce8b16ddcf.png filter=lfs diff=lfs merge=lfs -text
252
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8000_9652b904aa757dce7aeb.png filter=lfs diff=lfs merge=lfs -text
253
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_8500_c2d1b91c197ca101b350.png filter=lfs diff=lfs merge=lfs -text
254
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_03ba3747205343bd9935.png filter=lfs diff=lfs merge=lfs -text
255
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9000_cc8a6153b15016f58ad3.png filter=lfs diff=lfs merge=lfs -text
256
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/files/media/images/generated_videos_first_frame_9500_342589ce9380e8bb866b.png filter=lfs diff=lfs merge=lfs -text
257
+ Meissonic/wandb/run-20251208_155943-j5rc8ish/run-j5rc8ish.wandb filter=lfs diff=lfs merge=lfs -text
258
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_4ea9441b252682155006.png filter=lfs diff=lfs merge=lfs -text
259
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1000_be5afcc9b61ce7cc9765.png filter=lfs diff=lfs merge=lfs -text
260
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_7c59a605f746fefa06f3.png filter=lfs diff=lfs merge=lfs -text
261
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_1500_e846322d8d1fe1da0c06.png filter=lfs diff=lfs merge=lfs -text
262
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_e7251adb287026b97ff8.png filter=lfs diff=lfs merge=lfs -text
263
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2000_fb353e591b1b0dbac386.png filter=lfs diff=lfs merge=lfs -text
264
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_4254c55c5a44dae8222b.png filter=lfs diff=lfs merge=lfs -text
265
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_2500_880fb5b7bb7d55a41102.png filter=lfs diff=lfs merge=lfs -text
266
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_0af47ac2b0fd0a7b83b9.png filter=lfs diff=lfs merge=lfs -text
267
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3000_38859ead3b87553090be.png filter=lfs diff=lfs merge=lfs -text
268
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_1b3f708ccf2664b9bd84.png filter=lfs diff=lfs merge=lfs -text
269
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_3500_96fc2c23d9374b5c001f.png filter=lfs diff=lfs merge=lfs -text
270
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_7f60fcf85257e0427cb4.png filter=lfs diff=lfs merge=lfs -text
271
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_4000_f36bf77eea280b84a34e.png filter=lfs diff=lfs merge=lfs -text
272
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_92b8064a4e25f8ad3702.png filter=lfs diff=lfs merge=lfs -text
273
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/files/media/images/generated_videos_first_frame_500_f6969510d28d905ce414.png filter=lfs diff=lfs merge=lfs -text
274
+ Meissonic/wandb/run-20251209_060856-ctbp97lz/run-ctbp97lz.wandb filter=lfs diff=lfs merge=lfs -text
275
+ Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_2a764e89458c3c8d15fb.png filter=lfs diff=lfs merge=lfs -text
276
+ Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1000_80cf7f467b6a4ea9a5d4.png filter=lfs diff=lfs merge=lfs -text
277
+ Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_1500_fb32391d5c492e093a1a.png filter=lfs diff=lfs merge=lfs -text
278
+ Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_9a388a1a15b60d9f4438.png filter=lfs diff=lfs merge=lfs -text
279
+ Meissonic/wandb/run-20251209_102651-55o5soqg/files/media/images/generated_videos_first_frame_500_c2c619bff47ae122a524.png filter=lfs diff=lfs merge=lfs -text
280
+ Meissonic/wandb/run-20251209_102651-55o5soqg/run-55o5soqg.wandb filter=lfs diff=lfs merge=lfs -text
281
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_7b2c7dbea7c77c3a3523.png filter=lfs diff=lfs merge=lfs -text
282
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1000_d3b01b8e129b539a85ed.png filter=lfs diff=lfs merge=lfs -text
283
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_287117d5d7643ba31ec4.png filter=lfs diff=lfs merge=lfs -text
284
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_1500_f6b18ba278e34d44baab.png filter=lfs diff=lfs merge=lfs -text
285
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_321720abba124381620b.png filter=lfs diff=lfs merge=lfs -text
286
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2000_fa7af054654656754134.png filter=lfs diff=lfs merge=lfs -text
287
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_e6c1efef5a74bd11c582.png filter=lfs diff=lfs merge=lfs -text
288
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_2500_f00b3e2c752ac3cf926a.png filter=lfs diff=lfs merge=lfs -text
289
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_67d5ba7897e123897b95.png filter=lfs diff=lfs merge=lfs -text
290
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3000_9c128d777c7dab549107.png filter=lfs diff=lfs merge=lfs -text
291
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_4274b237825ef8cf5d05.png filter=lfs diff=lfs merge=lfs -text
292
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_3500_de7aecbbb4729ab5af9d.png filter=lfs diff=lfs merge=lfs -text
293
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_09fa45bbfff36049e141.png filter=lfs diff=lfs merge=lfs -text
294
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/files/media/images/generated_videos_first_frame_500_d8fc778d368d5c2cb79c.png filter=lfs diff=lfs merge=lfs -text
295
+ Meissonic/wandb/run-20251209_141739-fk5kdvzr/run-fk5kdvzr.wandb filter=lfs diff=lfs merge=lfs -text
296
+ Meissonic/wandb/run-20251209_162337-uv3abozu/run-uv3abozu.wandb filter=lfs diff=lfs merge=lfs -text
297
+ Meissonic/wandb/run-20251210_030325-gkrz1ykg/run-gkrz1ykg.wandb filter=lfs diff=lfs merge=lfs -text
298
+ Meissonic/wandb/run-20251210_032745-o7so78o8/run-o7so78o8.wandb filter=lfs diff=lfs merge=lfs -text
299
+ Meissonic/wandb/run-20251210_035336-u8db4xs3/run-u8db4xs3.wandb filter=lfs diff=lfs merge=lfs -text
300
+ Meissonic/wandb/run-20251210_043009-5878wpml/run-5878wpml.wandb filter=lfs diff=lfs merge=lfs -text
301
+ Meissonic/wandb/run-20251210_045934-tcqz8xbx/run-tcqz8xbx.wandb filter=lfs diff=lfs merge=lfs -text
302
+ Meissonic/wandb/run-20251210_065438-svzut638/run-svzut638.wandb filter=lfs diff=lfs merge=lfs -text
303
+ Meissonic/wandb/run-20251210_071716-kc9aapl4/run-kc9aapl4.wandb filter=lfs diff=lfs merge=lfs -text
304
+ Meissonic/wandb/run-20251210_105833-im5q8jfr/run-im5q8jfr.wandb filter=lfs diff=lfs merge=lfs -text
305
+ Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_654e84862d8c0a13f1b5.png filter=lfs diff=lfs merge=lfs -text
306
+ Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1000_cd153009051a7d605018.png filter=lfs diff=lfs merge=lfs -text
307
+ Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_56d7d215080b273e9155.png filter=lfs diff=lfs merge=lfs -text
308
+ Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_1500_736c017ca88662cd1d11.png filter=lfs diff=lfs merge=lfs -text
309
+ Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_4f69c990d95a223b9d06.png filter=lfs diff=lfs merge=lfs -text
310
+ Meissonic/wandb/run-20251210_114439-mrtah7xe/files/media/images/generated_videos_grid_500_5fc1dbdaeeaef4847234.png filter=lfs diff=lfs merge=lfs -text
311
+ Meissonic/wandb/run-20251210_114439-mrtah7xe/run-mrtah7xe.wandb filter=lfs diff=lfs merge=lfs -text
Meissonic/.github/FUNDING.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # These are supported funding model platforms
2
+
3
+ github: viiika
4
+ patreon: # Replace with a single Patreon username
5
+ open_collective: # Replace with a single Open Collective username
6
+ ko_fi: # Replace with a single Ko-fi username
7
+ tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8
+ community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9
+ liberapay: # Replace with a single Liberapay username
10
+ issuehunt: # Replace with a single IssueHunt username
11
+ lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12
+ polar: # Replace with a single Polar username
13
+ buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
14
+ thanks_dev: # Replace with a single thanks.dev username
15
+ custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
Meissonic/.gitignore ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Mac OS related
40
+ .DS_Store
41
+ *.DS_Store
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
114
+ .pdm.toml
115
+ .pdm-python
116
+ .pdm-build/
117
+
118
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
119
+ __pypackages__/
120
+
121
+ # Celery stuff
122
+ celerybeat-schedule
123
+ celerybeat.pid
124
+
125
+ # SageMath parsed files
126
+ *.sage.py
127
+
128
+ # Environments
129
+ .env
130
+ .venv
131
+ env/
132
+ venv/
133
+ ENV/
134
+ env.bak/
135
+ venv.bak/
136
+
137
+ # Spyder project settings
138
+ .spyderproject
139
+ .spyproject
140
+
141
+ # Rope project settings
142
+ .ropeproject
143
+
144
+ # mkdocs documentation
145
+ /site
146
+
147
+ # mypy
148
+ .mypy_cache/
149
+ .dmypy.json
150
+ dmypy.json
151
+
152
+ # Pyre type checker
153
+ .pyre/
154
+
155
+ # pytype static type analyzer
156
+ .pytype/
157
+
158
+ # Cython debug symbols
159
+ cython_debug/
160
+
161
+ # PyCharm
162
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
165
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166
+ #.idea/
Meissonic/InfinityStar/.gitignore ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.swp
2
+ **/__pycache__/**
3
+ **/.ipynb_checkpoints/**
4
+ .idea/*
5
+ llava/
6
+ _vis_cached/
7
+ _vqgan/
8
+ _vae/
9
+ _vae*/
10
+ ckpt/
11
+ log/
12
+ tb*/
13
+ img*/
14
+ local_output*
15
+ _auto_*
16
+ sd-vae-ft-mse/
17
+ stable-diffusion-v1-4/
18
+ *.pth
19
+ *.pth.tar
20
+ *.ckpt
21
+ *.log
22
+ *.ipynb
23
+ toscli
24
+ *.hydra
25
+ wandb
26
+ *.jpg
27
+ *.csv
28
+ *.tar.gz
29
+ *.bin
30
+ tmp
31
+ output
32
+ *.tsv
33
+ output/*
34
+ results/
35
+ *.JPEG
36
+ debug/
37
+ weights
38
+ checkpoints
39
+ ref.py
40
+ wandb
41
+ .DS_Store
42
+ ref.sh
43
+ ref.py
44
+ checkpoints_bk
45
+ *.avi
46
+ infinity/VideoVAE
47
+ saves/
48
+ tmp.sh
49
+ ref_*.sh
50
+ tmpp.sh
51
+ ref2.sh
52
+ checkpoints_new
53
+ checkpoints_*
54
+ tmp_images
55
+ tmp_videos
56
+ shm
57
+ wget-log
58
+ data/interactive_toy_data
59
+ tools/infer_interact_480p.py.bk
Meissonic/InfinityStar/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 FoundationVision
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Meissonic/InfinityStar/README.md ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <img src="assets/logo.png" width="400" style="border:none;box-shadow:none;border-radius:0;background:none;">
3
+ </p>
4
+
5
+ # Infinity**⭐️**: Unified **S**pace**T**ime **A**uto**R**egressive Modeling for Visual Generation
6
+
7
+
8
+ <div align="center">
9
+
10
+ [![demo platform](https://img.shields.io/badge/Play%20with%20Infinity%21-Infinity%20demo%20platform-lightblue)](http://opensource.bytedance.com/discord/invite)&nbsp;
11
+ [![arXiv](https://img.shields.io/badge/arXiv%20paper-2511.04675-b31b1b.svg)](https://arxiv.org/abs/2511.04675)&nbsp;
12
+ [![huggingface weights](https://img.shields.io/badge/%F0%9F%A4%97%20Weights-FoundationVision/Infinity-yellow)](https://huggingface.co/FoundationVision/InfinityStar)&nbsp;
13
+
14
+ </div>
15
+ <p align="center" style="font-size: larger;">
16
+ <a href="http://arxiv.org/abs/2511.04675">Infinity⭐️: Unified Spacetime AutoRegressive Modeling for Visual Generation</a>
17
+ </p>
18
+
19
+ <!-- <p align="center">
20
+ <img src="assets/show_images.jpg" width=95%>
21
+ <p> -->
22
+
23
+ ---
24
+ ## 🔥 Updates!!
25
+ * Nov 7, 2025: 🔥 Paper, training and inference code, checkpoints, and demo website released!
26
+ * Sep 18, 2025: 🎉 InfinityStar is accepted to NeurIPS 2025 as an Oral presentation.
27
+
28
+ ## 🕹️ Try and Play with Infinity⭐️!
29
+
30
+ We provide a [demo website](http://opensource.bytedance.com/discord/invite) for you to play with InfinityStar and generate videos. Enjoy the fun of bitwise video autoregressive modeling!
31
+
32
+ ## ✨ Overview
33
+ We introduce InfinityStar, a unified spacetime autoregressive framework for high-resolution image and dynamic video synthesis.
34
+
35
+ - 🧠 **Unified Spacetime Model**: A purely discrete, autoregressive approach that jointly captures spatial and temporal dependencies within a single, elegant architecture.
36
+
37
+ - 🎬 **Versatile Generation**: This unified design naturally supports a variety of generation tasks such as **text-to-image**, **text-to-video**, **image-to-video**, and **long interactive video synthesis** via straightforward temporal autoregression.
38
+
39
+ - 🏆 **Leading Performance & Speed**: In extensive experiments, InfinityStar scores **83.74** on VBench, outperforming all autoregressive models by large margins and even surpassing diffusion competitors like HunyuanVideo, while being approximately **10x** faster than leading diffusion-based methods.
40
+
41
+ - 📖 **Pioneering High-Resolution Autoregressive Generation**: To our knowledge, InfinityStar is the first discrete autoregressive video generator capable of producing industrial-level 720p videos, setting a new standard for quality in its class.
42
+
43
+
44
+ ### 🔥 Unified modeling for image, video generation and long interactive video synthesis 📈:
45
+
46
+ <div align="left">
47
+ <img src="assets/framework.png" alt="" style="width: 100%;" />
48
+ </div>
49
+
50
+ ## 🎬 Video Demos
51
+ #### General Aesthetics
52
+ <div align="left">
53
+ <video src="https://github.com/user-attachments/assets/14e2b18b-9234-42ce-bdab-670faeef4b2a" width="100%" controls autoplay loop></video>
54
+ </div>
55
+
56
+ #### Anime & 3D Animation
57
+ <div align="left">
58
+ <video src="https://github.com/user-attachments/assets/478e9571-b550-4c23-a567-6fee9a0afb5b" width="100%" controls autoplay loop></video>
59
+ </div>
60
+
61
+ #### Motion
62
+ <div align="left">
63
+ <video src="https://github.com/user-attachments/assets/adab669b-d38f-4607-9a52-32d8d0bf0e53" width="100%" controls autoplay loop></video>
64
+ </div>
65
+
66
+ #### Extended Application: Long Interactive Videos
67
+ <div align="center">
68
+ <video src="https://github.com/user-attachments/assets/411666a6-563d-4551-a3f8-dc5de00436c1" width="100%" controls autoplay loop></video>
69
+ </div>
70
+
71
+ ## Benchmark
72
+
73
+ ### Achieves SOTA performance on image generation benchmarks:
74
+
75
+ <div align="left">
76
+ <img src="assets/Infinitystar_image_gen_benchmark.png" alt="Image Generation Evaluation" style="width: 100%;" />
77
+ </div>
78
+
79
+ ### Achieves SOTA performance on video generation benchmarks:
80
+
81
+ <div align="left">
82
+ <img src="assets/Infinitystar_videogen_benchmark.png" alt="" style="width: 100%;" />
83
+ </div>
84
+
85
+ ### Surpassing diffusion competitors like HunyuanVideo*:
86
+
87
+ <div align="left">
88
+ <img src="assets/Infinitystar_videogen_humaneval.png" alt="" style="width: 100%;" />
89
+ </div>
90
+
91
+
92
+ ## Visualization
93
+
94
+ ### Text to image examples
95
+
96
+ <div align="left">
97
+ <img src="assets/supp_show_images.png" alt="Text to Image Examples" style="width: 100%;" />
98
+ </div>
99
+
100
+ ### Image to video examples
101
+
102
+ <div align="left">
103
+ <img src="assets/i2v_examples.png" alt="Image to Video Examples" style="width: 100%;" />
104
+ </div>
105
+
106
+ ### Video extrapolation examples
107
+
108
+ <div align="left">
109
+ <img src="assets/v2v_examples.png" alt="Video Extrapolation Examples" style="width: 100%;" />
110
+ </div>
111
+
112
+ ## 📑 Open-Source Plan
113
+ - [x] Training Code
114
+ - [x] Web Demo
115
+ - [x] InfinityStar Inference Code
116
+ - [x] InfinityStar Model Checkpoints
117
+ - [x] InfinityStar-Interact Inference Code
118
+ - [x] InfinityStar-Interact Checkpoints
119
+
120
+
121
+ ## Installation
122
+ 1. We use FlexAttention to speed up training, which requires `torch>=2.5.1`.
123
+ 2. Install other pip packages via `pip3 install -r requirements.txt`.
124
+
125
+
126
+ ## Training Scripts
127
+ We provide a comprehensive workflow for training and finetuning our model, covering data organization, feature extraction, and training scripts. For detailed instructions, please refer to `data/README.md`.
128
+
129
+ ## Inference
130
+ * **720p Video Generation:**
131
+ Use `tools/infer_video_720p.py` to generate 5-second videos at 720p resolution. Due to the high computational cost of training, our released 720p model is trained for 5-second video generation. This script also supports image-to-video generation by specifying an image path.
132
+ ```bash
133
+ python3 tools/infer_video_720p.py
134
+ ```
135
+
136
+ * **480p Variable-Length Video Generation:**
137
+ We also provide an intermediate checkpoint for 480p resolution, capable of generating videos of 5 and 10 seconds. Since this model is not specifically optimized for Text-to-Video (T2V), we recommend using the experimental Image-to-Video (I2V) and Video-to-Video (V2V) modes for better results. To specify the video duration, you can edit the `generation_duration` variable in `tools/infer_video_480p.py` to either 5 or 10. This script also supports image-to-video and video continuation by providing a path to an image or a video.
138
+ ```bash
139
+ python3 tools/infer_video_480p.py
140
+ ```
141
+
142
+ * **480p Long Interactive Video Generation:**
143
+ Use `tools/infer_interact_480p.py` to generate a long interactive video in 480p. This script supports interactive video generation. You can provide a reference video and multiple prompts. The model will generate a video interactively with your assistance.
144
+ ```bash
145
+ python3 tools/infer_interact_480p.py
146
+ ```
147
+
148
+ ## Citation
149
+ If our work assists your research, feel free to give us a star ⭐ or cite us using:
150
+
151
+ ```
152
+ @Article{VAR,
153
+ title={Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale Prediction},
154
+ author={Keyu Tian and Yi Jiang and Zehuan Yuan and Bingyue Peng and Liwei Wang},
155
+ year={2024},
156
+ eprint={2404.02905},
157
+ archivePrefix={arXiv},
158
+ primaryClass={cs.CV}
159
+ }
160
+ ```
161
+
162
+ ```
163
+ @misc{Infinity,
164
+ title={Infinity: Scaling Bitwise AutoRegressive Modeling for High-Resolution Image Synthesis},
165
+ author={Jian Han and Jinlai Liu and Yi Jiang and Bin Yan and Yuqi Zhang and Zehuan Yuan and Bingyue Peng and Xiaobing Liu},
166
+ year={2024},
167
+ eprint={2412.04431},
168
+ archivePrefix={arXiv},
169
+ primaryClass={cs.CV},
170
+ url={https://arxiv.org/abs/2412.04431},
171
+ }
172
+ ```
173
+
174
+ ```
175
+ @misc{InfinityStar,
176
+ title={InfinityStar: Unified Spacetime AutoRegressive Modeling for Visual Generation},
177
+ author={Jinlai Liu and Jian Han and Bin Yan and Hui Wu and Fengda Zhu and Xing Wang and Yi Jiang and Bingyue Peng and Zehuan Yuan},
178
+ year={2025},
179
+ eprint={2511.04675},
180
+ archivePrefix={arXiv},
181
+ primaryClass={cs.CV},
182
+ url={https://arxiv.org/abs/2511.04675},
183
+ }
184
+ ```
185
+
186
+ ## License
187
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
Meissonic/InfinityStar/__pycache__/train.cpython-310.pyc ADDED
Binary file (15.6 kB). View file
 
Meissonic/InfinityStar/assets/Infinitystar_image_gen_benchmark.png ADDED

Git LFS Details

  • SHA256: 6d14e42b1cfac29f069e8f8ce36467bdfd74367bc7ac7cd850fe2b4865f9191d
  • Pointer size: 131 Bytes
  • Size of remote file: 490 kB
Meissonic/InfinityStar/assets/Infinitystar_videogen_benchmark.png ADDED

Git LFS Details

  • SHA256: b05f52485ee8d5a68519d42a3240873f8834ae0c67456c5158bb21a79389c39a
  • Pointer size: 131 Bytes
  • Size of remote file: 442 kB
Meissonic/InfinityStar/assets/Infinitystar_videogen_humaneval.png ADDED

Git LFS Details

  • SHA256: d031b401dde7b11c7491c0733e43ac9dcc09544c4c4398ee26915753ccf9be29
  • Pointer size: 131 Bytes
  • Size of remote file: 249 kB
Meissonic/InfinityStar/assets/framework.png ADDED

Git LFS Details

  • SHA256: 7c3fcec1d6d95698b18a2df7a27a0e8cc376d39836e1384d2d9168ff169e0721
  • Pointer size: 132 Bytes
  • Size of remote file: 3.79 MB
Meissonic/InfinityStar/assets/i2v_examples.png ADDED

Git LFS Details

  • SHA256: 2e7653bbb42541f39297e5008875bbf7a30b98dc6025e0b741f1e937c68e33e4
  • Pointer size: 132 Bytes
  • Size of remote file: 4.65 MB
Meissonic/InfinityStar/assets/logo.png ADDED

Git LFS Details

  • SHA256: b8d796b40078c0a10ddcbf8261cc8c66fe141f4fa183d92490cc820c74d3d980
  • Pointer size: 131 Bytes
  • Size of remote file: 536 kB
Meissonic/InfinityStar/assets/reference_image.webp ADDED

Git LFS Details

  • SHA256: 0964f3ac35717552547c7d2f40482124d51cfddc2ba6f56fa9427b2f8bd91156
  • Pointer size: 131 Bytes
  • Size of remote file: 199 kB
Meissonic/InfinityStar/assets/supp_show_images.png ADDED

Git LFS Details

  • SHA256: 6b384868e3fff6762658cc5c8505eba67c41c355a7dc5c2d0c7856fdb8ccf163
  • Pointer size: 133 Bytes
  • Size of remote file: 26.4 MB
Meissonic/InfinityStar/assets/v2v_examples.png ADDED

Git LFS Details

  • SHA256: 63c3c9c0db94e27f098c3ede52f83d64b66afe50eff768a2d1635cf4c8202fe0
  • Pointer size: 132 Bytes
  • Size of remote file: 4.96 MB
Meissonic/InfinityStar/cog.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for Cog ⚙️
2
+ # Reference: https://cog.run/yaml
3
+
4
+ build:
5
+ # set to true if your model requires a GPU
6
+ gpu: true
7
+
8
+ # a list of ubuntu apt packages to install
9
+ system_packages:
10
+ - "libgl1-mesa-glx"
11
+ - "libglib2.0-0"
12
+
13
+ # python version in the form '3.11' or '3.11.4'
14
+ python_version: "3.11"
15
+
16
+ # a list of packages in the format <package-name>==<version>
17
+ python_packages:
18
+ - torch
19
+ - transformers
20
+ - easydict
21
+ - typed-argument-parser
22
+ - seaborn
23
+ - kornia
24
+ - gputil
25
+ - colorama
26
+ - omegaconf
27
+ - pandas
28
+ - timm==0.9.6
29
+ - decord
30
+ - pytz
31
+ - pandas
32
+ - wandb
33
+ - colorama
34
+ - imageio
35
+ - einops
36
+ - openai
37
+ - httpx==0.20.0
38
+ - opencv-python
39
+ - ipython
40
+
41
+ # commands run after the environment is setup
42
+ run:
43
+ - pip install "pydantic<2.0"
44
+ - pip install -U flash-attn --no-build-isolation
45
+ - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
46
+ predict: "predict.py:Predictor"
Meissonic/InfinityStar/data/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Preparing and Training with Video Metadata
2
+
3
+ This guide walks you through preparing your video metadata, splitting it for efficient training, and running the training scripts.
4
+
5
+ ## 1. Prepare Your Data in `.jsonl` Format
6
+
7
+ Your video metadata should be organized in JSON Lines (`.jsonl`) format, where each line is a valid JSON object representing one video.
8
+
9
+ **Example:**
10
+ ```json
11
+ {
12
+ "video_path": "data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4",
13
+ "begin_frame_id": 0,
14
+ "end_frame_id": 120,
15
+ "tarsier2_caption": "The video features an animated character with long light orange hair and brown eyes.",
16
+ "width": 1280,
17
+ "height": 720,
18
+ "h_div_w": 0.5625,
19
+ "fps": 24
20
+ }
21
+ ```
22
+
23
+ ## 2. Split Metadata for Training
24
+
25
+ For efficient training, large `.jsonl` files can be split into smaller chunks.
26
+
27
+ ```bash
28
+ python3 data/infinitystar_toy_data/split_jsonls_for_training.py --jsonl_folder_list JSONL_DIR --save_dir SAVE_DIR --chunk_size 100
29
+ ```
30
+
31
+ ## 3. Extract Video Features
32
+
33
+ To extract video features, modify the `extract_video_features.sh` script. Set the `video_data_path` and choose the desired resolution.
34
+
35
+ * **480p (5s):** `pn=0.40M`
36
+ * **480p (10s):** `pn=0.40M` with `video_frames=161`
37
+ * **720p (5s):** `pn=0.90M`
38
+
39
+ Then, run the script:
40
+ ```bash
41
+ bash scripts/extract_video_features.sh
42
+ ```
43
+
44
+ ## 4. Run Training Scripts
45
+
46
+ Once your metadata is prepared and features are extracted, you can start training.
47
+
48
+ **480p Training (5s or 10s):**
49
+ ```bash
50
+ bash scripts/train_480p.sh
51
+ ```
52
+
53
+ **720p Training (only 5s):**
54
+ ```bash
55
+ bash scripts/train_720p.sh
56
+ ```
57
+ The 480p configuration supports both 5-second and 10-second video training. For 10-second training, ensure that `video_frames` is set to `161` in `extract_video_features.sh` and `train_480p.sh`.
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0001_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0002_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0003_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0004_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0005_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0006_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0007_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0008_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0009_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls/000001/0010_0010_000000100.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/data/infinitystar_toy_data/split_jsonls_for_training.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 FoundationVision
2
+ # SPDX-License-Identifier: MIT
3
+ import os
4
+ import os.path as osp
5
+ import time
6
+ import itertools
7
+ import shutil
8
+ import glob
9
+ import argparse
10
+ import json
11
+
12
+ import tqdm
13
+ import numpy as np
14
+ import threading
15
+
16
def save_lines(lines, filename):
    """Write ``lines`` to ``filename``, creating the parent directory if needed.

    Args:
        lines: Iterable of strings, written verbatim (no newline is appended).
        filename: Destination path; an existing file is overwritten.
    """
    parent = osp.dirname(filename)
    # A bare file name has an empty dirname and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, 'w') as f:
        f.writelines(lines)
21
+
22
def get_part_jsonls(save_dir, total_line_number, ext='.jsonl', chunk_size=1000):
    """Plan the output files for splitting ``total_line_number`` lines into chunks.

    An existing ``save_dir`` is deleted first. Chunk ids start at 1; every
    chunk holds ``chunk_size`` lines except the last, which takes the rest.
    Files are grouped 1000 per sub-directory and named
    ``<bucket:06d>/<chunk_id:04d>_<parts:04d>_<num_of_lines:09d><ext>``.

    Args:
        save_dir: Root directory of the planned files (removed if it exists).
        total_line_number: Total number of lines to distribute.
        ext: File extension, including the dot.
        chunk_size: Maximum number of lines per file; must be positive.

    Returns:
        ``(missing, chunk_id2save_files)`` where ``missing`` is True when at
        least one planned file does not exist and ``chunk_id2save_files`` maps
        chunk id to its path.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')
    if osp.exists(save_dir):
        shutil.rmtree(save_dir)
    chunk_id2save_files = {}
    missing = False
    # Exact integer ceil-division; avoids the float round-trip of np.ceil.
    parts = int(-(-total_line_number // chunk_size))
    for chunk_id in range(1, parts + 1):
        if chunk_id == parts:
            num_of_lines = total_line_number - chunk_size * (parts - 1)
        else:
            num_of_lines = chunk_size
        bucket = (chunk_id - 1) // 1000 + 1
        save_file = osp.join(save_dir, f'{bucket:06d}', f'{chunk_id:04d}_{parts:04d}_{num_of_lines:09d}{ext}')
        chunk_id2save_files[chunk_id] = save_file
        if not osp.exists(save_file):
            missing = True
    return missing, chunk_id2save_files
38
+
39
def split_large_txt_files(all_lines, chunk_id2save_files):
    """Write ``all_lines`` into the chunk files described by ``chunk_id2save_files``.

    Chunks are filled in ascending chunk-id order. The number of lines of each
    chunk is read from the last ``_``-separated field of its file name (without
    extension). Every chunk is written by its own thread through ``save_lines``
    and all threads are joined before returning.

    Args:
        all_lines: Sequence of lines to distribute.
        chunk_id2save_files: Mapping chunk id -> destination path.

    Raises:
        ValueError: If the sizes encoded in the file names do not add up to
            ``len(all_lines)``; nothing is written in that case.
    """
    # Parse each target size once instead of once per input line.
    planned = []
    for chunk_id in sorted(chunk_id2save_files):
        save_file = chunk_id2save_files[chunk_id]
        size = int(osp.splitext(osp.basename(save_file))[0].split('_')[-1])
        planned.append((save_file, size))

    expected = sum(size for _, size in planned)
    if expected != len(all_lines):
        raise ValueError(
            f'chunk files account for {expected} lines but {len(all_lines)} lines were given'
        )

    thread_list = []
    pbar = tqdm.tqdm(total=len(planned))
    offset = 0
    for save_file, size in planned:
        chunk = all_lines[offset:offset + size]
        offset += size
        thread = threading.Thread(target=save_lines, args=(chunk, save_file))
        thread.start()
        thread_list.append(thread)
        pbar.update(1)
    for thread in thread_list:
        thread.join()
    pbar.close()
60
+
61
from multiprocessing import Manager
# read_jsonl is only dispatched through a ThreadPool (see read_jsonls), so an
# in-process lock is sufficient; Manager().Lock() would start a separate
# manager server process as a side effect of importing this module.
lock = threading.Lock()


def read_jsonl(jsonl_file):
    """Return all lines of ``jsonl_file`` and advance the shared progress bar.

    The module-level ``pbar`` is updated under ``lock`` when it exists; when no
    progress bar has been created the file is still read normally.

    Args:
        jsonl_file: Path of the file to read.

    Returns:
        List of lines, line endings included.
    """
    with open(jsonl_file, 'r') as f:
        lines = f.readlines()
    progress = globals().get('pbar')
    if progress is not None:
        with lock:
            progress.update(1)
    return lines
70
+
71
def read_jsonls(jsonl_files, worker):
    """Load the lines of every file in ``jsonl_files`` and shuffle them.

    A single file is streamed line by line; its expected line count for the
    progress bar is taken from the last ``_``-separated field of the file name
    (0 when that field is not an integer). Several files are read concurrently
    with ``read_jsonl`` on a thread pool.

    Args:
        jsonl_files: List of file paths.
        worker: Number of threads used when more than one file is given.

    Returns:
        All lines of all files, shuffled in place with ``np.random.shuffle``.
    """
    global pbar
    from multiprocessing.pool import ThreadPool
    print(f'[Data Loading] Reading {len(jsonl_files)} meta files...')
    all_lines = []
    if len(jsonl_files) == 1:
        try:
            lines_num = int(osp.splitext(jsonl_files[0])[0].split('_')[-1])
        except ValueError:
            # File name does not encode a line count.
            lines_num = 0
        pbar = tqdm.tqdm(total=lines_num)
        with open(jsonl_files[0], 'r') as f:
            for line in f:
                pbar.update(1)
                all_lines.append(line)
    else:
        # read_jsonl ticks this module-level bar once per finished file.
        pbar = tqdm.tqdm(total=len(jsonl_files))
        with ThreadPool(worker) as pool:
            for file_lines in pool.map(read_jsonl, jsonl_files):
                all_lines.extend(file_lines)
    pbar.close()
    np.random.shuffle(all_lines)
    return all_lines
93
+
94
if __name__ == '__main__':
    # Collect every *.jsonl under the given folders, load and shuffle the
    # lines, then rewrite them as fixed-size chunk files under --save_dir.
    parser = argparse.ArgumentParser()
    parser.add_argument('--jsonl_folder_list', type=str, default=[], nargs='+', help='patha pathb pathc')
    parser.add_argument('--save_dir', type=str, default='')
    parser.add_argument('--chunk_size', type=int, default=1000)
    parser.add_argument('--worker', type=int, default=128)
    args = parser.parse_args()

    t1 = time.time()
    jsonl_files = []
    for item in args.jsonl_folder_list:
        jsonl_files += glob.glob(osp.join(item, '*.jsonl'))
    np.random.shuffle(jsonl_files)

    # read_jsonls creates the progress bar it needs.
    lines = read_jsonls(jsonl_files, args.worker)
    # Repeat the loaded lines 1000 times.
    # NOTE(review): presumably intentional to inflate the toy dataset - confirm.
    lines = lines * 1000
    print(f'total {len(lines)} lines')
    line_num = len(lines)
    missing, chunk_id2save_files = get_part_jsonls(args.save_dir, line_num, chunk_size=args.chunk_size)

    split_large_txt_files(lines, chunk_id2save_files)
    t2 = time.time()
    print(f'split takes {t2-t1}s')
Meissonic/InfinityStar/data/infinitystar_toy_data/videos/e06b8ca5dbc6.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2aba75b0f17a90f9a150bd331f36b64ad4ef5bd298c3cf09e6b77a005b70b8df
3
+ size 4908102
Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/0000_refine_720p.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b83cf64485c51f1cbbfdc1d627d7ce15de72ddce8c46de54adaf1231bf4a9313
3
+ size 8972169
Meissonic/InfinityStar/data/interactive_toy_videos/002a061bdbc110ca8fb48e7e0a663c94/prompt.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ The office is tidy, with a large desk covered in papers, a laptop, and a cup of coffee. A man in a white shirt sits at the desk, typing on the laptop keyboard. A desk lamp is turned on, casting light on the workspace.
2
+ The man stops typing and picks up the cup of coffee from the desk.
3
+ The man takes a sip from the coffee cup.
4
+ The man sets the coffee cup down and opens a notebook lying next to the laptop.
Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/0000_refine_720p.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1f64057df76d22164b3104c50894f61927aff7897cc15b684ead3622f231937
3
+ size 15799597
Meissonic/InfinityStar/data/interactive_toy_videos/0037784abafa76e2b84fe746750a7988/prompt.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ A young boy wearing a yellow t-shirt and denim shorts is in a backyard garden. A red ball, a blue watering can, and a green garden hose lie on the grass nearby. The boy is standing next to a flower bed filled with colorful flowers, holding the blue watering can in his right hand. The sun is shining brightly overhead.
2
+ The boy lifts the watering can and starts pouring water onto the flowers in the flower bed.
3
+ The boy sets the watering can down on the grass and picks up the red ball with both hands.
4
+ The boy throws the red ball forward into the garden while standing near the flower bed.
Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/0000_refine_720p.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd141840a07bb0b06f87df221eec2b705417a8aad13622f7b2298cf91d2eb2c7
3
+ size 10632210
Meissonic/InfinityStar/data/interactive_toy_videos/00a79efb495c29e082c246e9ca9a7e8f/prompt.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ A young woman dressed in a light gray hoodie and black leggings is sitting on a wooden bench in a city park. Around her, there are green trees, a paved walking path, and a metal water bottle placed on the bench beside her. She is holding a closed book in her lap and looking ahead thoughtfully. The sky is clear with soft afternoon sunlight filtering through the leaves.
2
+ The woman opens the book and begins to read, her eyes scanning the pages.
3
+ The woman lifts the metal water bottle and takes a sip from it.
4
+ The woman closes the book and looks up, observing the park surroundings.
Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/0000_refine_720p.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53e8ebcffd4dd3a6a92acb4f7c836ec51ff874258d3c43b2aa56387b06c4384
3
+ size 13742016
Meissonic/InfinityStar/data/interactive_toy_videos/011341aa8bea615b76d69423862f5f31/prompt.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ The garden is filled with blooming flowers and a wooden bench near a stone path. A watering can and a pair of gardening gloves rest on the bench. A woman in a light green dress stands by the bench, holding a small potted plant with soil visible in the pot. She looks at the plant attentively.
2
+ The woman places the potted plant on the bench next to the watering can.
3
+ The woman picks up the watering can from the bench and lifts it.
4
+ The woman waters the flowers along the stone path using the watering can.
Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/0000_refine_720p.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:933e9fd3b2dfe640dc193357576dd8f7f894cdde1e2e9f7eba753de09a5a1ef7
3
+ size 12703988
Meissonic/InfinityStar/data/interactive_toy_videos/012900f0605d2e4777119aeefaa7f31b/prompt.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ A young boy wearing a yellow t-shirt and denim shorts stands in a park next to a wooden bench. On the bench lies a red soccer ball and a blue backpack. Trees with green leaves surround the area, and sunlight filters through the branches. The boy looks at the soccer ball while holding the straps of his backpack.
2
+ The boy bends down and picks up the red soccer ball from the bench.
3
+ The boy holds the soccer ball with both hands and begins to bounce it on the ground.
4
+ The boy kicks the soccer ball forward, sending it rolling across the grass.
Meissonic/InfinityStar/evaluation/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Overview
2
+ To facilitate reproducibility and evaluation, we provide the rewritten prompts used in our VBench evaluations. After generating videos with our inference script, you can evaluate their performance using the scoring tools available at [VBench](https://github.com/Vchitect/VBench).
Meissonic/InfinityStar/evaluation/VBench_rewrited_prompt.json ADDED
The diff for this file is too large to render. See raw diff
 
Meissonic/InfinityStar/infinity/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Copyright (c) 2025 FoundationVision
2
+ # SPDX-License-Identifier: MIT
Meissonic/InfinityStar/infinity/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (141 Bytes). View file
 
Meissonic/InfinityStar/infinity/dataset/__pycache__/build.cpython-310.pyc ADDED
Binary file (7.53 kB). View file
 
Meissonic/InfinityStar/infinity/dataset/__pycache__/dataset_joint_vi.cpython-310.pyc ADDED
Binary file (21.8 kB). View file
 
Meissonic/InfinityStar/infinity/dataset/build.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 FoundationVision
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ import datetime
5
+ import os
6
+ import os.path as osp
7
+ import random
8
+ import subprocess
9
+ from functools import partial
10
+ from typing import Optional
11
+ import time
12
+
13
+ import pytz
14
+
15
+ from infinity.dataset.dataset_joint_vi import JointViIterableDataset
16
+ from infinity.utils.sequence_parallel import SequenceParallelManager as sp_manager
17
+
18
+ try:
19
+ from grp import getgrgid
20
+ from pwd import getpwuid
21
+ except:
22
+ pass
23
+ import PIL.Image as PImage
24
+ from PIL import ImageFile
25
+ import numpy as np
26
+ from torchvision.transforms import transforms
27
+ from torchvision.transforms.functional import resize, to_tensor
28
+ import torch.distributed as tdist
29
+
30
+ from torchvision.transforms import InterpolationMode
31
+ bicubic = InterpolationMode.BICUBIC
32
+ lanczos = InterpolationMode.LANCZOS
33
+ PImage.MAX_IMAGE_PIXELS = (1024 * 1024 * 1024 // 4 // 3) * 5
34
+ ImageFile.LOAD_TRUNCATED_IMAGES = False
35
+
36
+
37
def time_str(fmt='[%m-%d %H:%M:%S]'):
    """Return the current Asia/Shanghai wall-clock time formatted with ``fmt``."""
    shanghai = pytz.timezone('Asia/Shanghai')
    now = datetime.datetime.now(tz=shanghai)
    return now.strftime(fmt)
39
+
40
+
41
def normalize_01_into_pm1(x):  # normalize x from [0, 1] to [-1, 1] by (x*2) - 1
    """Map ``x`` from [0, 1] to [-1, 1]; ``x`` itself is not modified."""
    doubled = x.add(x)  # out-of-place, so the caller's tensor is untouched
    doubled.add_(-1)
    return doubled
43
+
44
+
45
def denormalize_pm1_into_01(x):  # denormalize x from [-1, 1] to [0, 1]
    """Map ``x`` from [-1, 1] back to [0, 1]; ``x`` itself is not modified."""
    shifted = x.add(1)  # out-of-place, so the caller's tensor is untouched
    shifted.mul_(0.5)
    return shifted
47
+
48
+
49
def center_crop_arr(pil_image, image_size):
    """
    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126

    Returns a square ``image_size`` x ``image_size`` PIL image cut from the
    center of ``pil_image`` after resizing its short side to ``image_size``.
    """
    # Halve with a box filter while the short side is at least twice the target.
    while min(*pil_image.size) >= 2 * image_size:
        halved = tuple(side // 2 for side in pil_image.size)
        pil_image = pil_image.resize(halved, resample=PImage.BOX)

    # Final resize so the short side equals image_size.
    factor = image_size / min(*pil_image.size)
    scaled = tuple(round(side * factor) for side in pil_image.size)
    pil_image = pil_image.resize(scaled, resample=PImage.LANCZOS)

    pixels = np.array(pil_image)
    top = (pixels.shape[0] - image_size) // 2
    left = (pixels.shape[1] - image_size) // 2
    return PImage.fromarray(pixels[top: top + image_size, left: left + image_size])
68
+
69
+
70
class RandomResize:
    """Resize an image to a size drawn uniformly from ``[reso_lb, reso_ub]``.

    ``reso_lb`` is ``final_reso``. ``reso_ub`` is
    ``mid_reso + (mid_reso - final_reso) / 8`` rounded to a multiple of 4, but
    never less than ``mid_reso``.
    """

    def __init__(self, mid_reso, final_reso, interpolation):
        padded = mid_reso + (mid_reso - final_reso) / 8
        snapped = round(padded / 4) * 4
        self.reso_lb = final_reso
        self.reso_ub = max(snapped, mid_reso)
        self.interpolation = interpolation

    def __call__(self, img):
        target = random.randint(self.reso_lb, self.reso_ub)
        return resize(img, size=target, interpolation=self.interpolation)

    def __repr__(self):
        return f'RandomResize(reso=({self.reso_lb}, {self.reso_ub}), interpolation={self.interpolation})'
81
+
82
+
83
def load_save(reso=512):
    """Debug helper: crop five fixed images and stitch them into one strip.

    Each source image is converted to RGB, resized so its short side is
    ``reso`` (Lanczos), center-cropped to ``reso`` x ``reso`` and saved next to
    the source as ``crop<name>`` (``.jpg`` names become ``.png``). The crops are
    then pasted side by side and saved as ``junfeng.png`` in the source folder.

    Args:
        reso: Side length of the square crops.
    """
    import os
    from PIL import Image as PImage
    from torchvision.transforms import transforms, InterpolationMode
    aug = transforms.Compose([
        transforms.Resize(reso, interpolation=InterpolationMode.LANCZOS),
        transforms.CenterCrop((reso, reso))
    ])
    # NOTE(review): hard-coded developer path; only usable on that machine.
    src_folder = r'C:\Users\16333\Pictures\imgs_to_visual_v2'
    ls = [os.path.join(src_folder, x) for x in ('1.jpg', '2.jpg', '3.png', '4.png', '5.png')]
    print(ls)
    imgs = []
    for fname in ls:
        assert os.path.exists(fname)
        with PImage.open(fname) as img:
            img = img.convert('RGB')
            img = aug(img)
            imgs.append(img)
            dst_d, dst_f = os.path.split(fname)
            dst = os.path.join(dst_d, f'crop{dst_f.replace(".jpg", ".png")}')
            img.save(dst)

    W, H = imgs[0].size
    WW = W * len(imgs)
    new_im = PImage.new('RGB', (WW, H))
    x_offset = 0
    for img in imgs:
        new_im.paste(img, (x_offset, 0))
        x_offset += W
    dst = os.path.join(src_folder, f'junfeng.png')
    new_im.save(dst)
114
+
115
+
116
def print_aug(transform, label):
    """Print ``transform`` under a ``label`` heading.

    When ``transform`` exposes a ``transforms`` attribute each element is
    printed on its own line; otherwise ``transform`` itself is printed.
    """
    print(f'Transform {label} = ')
    steps = transform.transforms if hasattr(transform, 'transforms') else (transform,)
    for step in steps:
        print(step)
    print('---------------------------\n')
124
+
125
+
126
def build_t2i_dataset(
    args,
    data_path: str,
    max_caption_len: int,
    short_prob=0.2,
    load_vae_instead_of_image=False
):
    """Build the streaming text-to-image dataset described by ``args``.

    Only ``args.use_streaming_dataset`` being truthy is supported. The replica
    count and rank come from the sequence-parallel manager when sequence
    parallelism is on, otherwise from ``torch.distributed``.

    Raises:
        ValueError: If ``args.use_streaming_dataset`` is falsy.
    """
    if not args.use_streaming_dataset:
        raise ValueError(f'args.use_streaming_dataset={args.use_streaming_dataset} unsupported')

    # NOTE(review): T2IIterableDataset is not imported anywhere in this module,
    # so this call raises NameError as written - confirm where it should come from.
    return T2IIterableDataset(
        data_path,
        max_caption_len=max_caption_len,
        short_prob=short_prob,
        load_vae_instead_of_image=load_vae_instead_of_image,
        buffersize=args.iterable_data_buffersize,
        pn=args.pn,
        online_t5=args.online_t5,
        batch_size=args.batch_size,
        num_replicas=sp_manager.get_sp_group_nums() if sp_manager.sp_on() else tdist.get_world_size(), # 1,
        rank = sp_manager.get_sp_group_rank() if sp_manager.sp_on() else tdist.get_rank(),
        dataloader_workers=args.workers,
        dynamic_resolution_across_gpus=args.dynamic_resolution_across_gpus,
        enable_dynamic_length_prompt=args.enable_dynamic_length_prompt,
        seed=args.seed,
        dynamic_scale_schedule=args.dynamic_scale_schedule,
    )
153
+
154
+
155
def build_joint_dataset(
    args,
    image_data_path: str,
    video_data_path: str,
    max_caption_len: int,
    short_prob=0.2,
    load_vae_instead_of_image=False
):
    """Build the streaming joint image/video dataset described by ``args``.

    Only ``args.use_streaming_dataset`` being truthy is supported. The replica
    count and rank come from the sequence-parallel manager when sequence
    parallelism is on, otherwise from ``torch.distributed``. ``args`` itself is
    also forwarded as ``other_args``.

    Raises:
        ValueError: If ``args.use_streaming_dataset`` is falsy.
    """
    if not args.use_streaming_dataset:
        raise ValueError(f'args.use_streaming_dataset={args.use_streaming_dataset} unsupported')

    return JointViIterableDataset(
        image_meta_folder=image_data_path,
        video_meta_folder=video_data_path,
        max_caption_len=max_caption_len,
        short_prob=short_prob,
        load_vae_instead_of_image=load_vae_instead_of_image,
        buffersize=args.iterable_data_buffersize,
        pn=args.pn,
        video_fps=args.video_fps,
        num_frames=args.video_frames,
        online_t5=args.online_t5,
        num_replicas=sp_manager.get_sp_group_nums() if sp_manager.sp_on() else tdist.get_world_size(), # 1,
        rank = sp_manager.get_sp_group_rank() if sp_manager.sp_on() else tdist.get_rank(),
        dataloader_workers=args.workers,
        dynamic_resolution_across_gpus=args.dynamic_resolution_across_gpus,
        enable_dynamic_length_prompt=args.enable_dynamic_length_prompt,
        dynamic_scale_schedule=args.dynamic_scale_schedule,
        add_motion_score2caption=args.add_motion_score2caption,
        seed=args.seed,
        other_args=args,
    )
187
+
188
def pil_load(path: str, proposal_size):
    """Open the image at ``path`` and return it converted to RGB.

    When the short side exceeds ``proposal_size`` a proportionally reduced
    size is requested through ``Image.draft`` before conversion.
    """
    with open(path, 'rb') as fh:
        img = PImage.open(fh)
        w, h = img.width, img.height
        short_side = min(h, w)
        if short_side > proposal_size:
            shrink = proposal_size / short_side
            w, h = round(shrink * w), round(shrink * h)
        # NOTE(review): draft() is applied whether or not a reduced size was
        # computed - confirm against the upstream file.
        img.draft('RGB', (w, h))
        img = img.convert('RGB')
    return img
201
+
202
+
203
def rewrite(im: PImage, file: str, info: str):
    """Re-encode ``im`` and move it over ``file``, restoring owner, group and mode.

    The image is first saved under the base name of ``file`` in the current
    directory (quality 100; uncompressed for TIFF, lossless for WebP), then
    moved into place with ``sudo mv`` and its original ownership and permission
    bits are re-applied with ``sudo chown`` / ``sudo chmod``.

    Args:
        im: Image object providing ``save``.
        file: Path of the existing file to overwrite.
        info: Free-form tag included in the log line.
    """
    kw = dict(quality=100)
    lowered = file.lower()
    if lowered.endswith(('.tif', '.tiff')):
        kw['compression'] = 'none'
    elif lowered.endswith('.webp'):
        kw['lossless'] = True

    st = os.stat(file)
    uname = getpwuid(st.st_uid).pw_name
    gname = getgrgid(st.st_gid).gr_name
    mode = oct(st.st_mode)[-3:]

    local_file = osp.basename(file)
    im.save(local_file, **kw)
    print(f'************* <REWRITE: {info}> ************* @ {file}')
    # Argument lists instead of one shell string: paths containing spaces or
    # shell metacharacters are passed through verbatim and cannot inject
    # commands. As before, each step runs regardless of the previous result.
    subprocess.call(['sudo', 'mv', local_file, file])
    subprocess.call(['sudo', 'chown', f'{uname}:{gname}', file])
    subprocess.call(['sudo', 'chmod', mode, file])
Meissonic/InfinityStar/infinity/dataset/dataset_joint_vi.py ADDED
@@ -0,0 +1,689 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 FoundationVision
2
+ # SPDX-License-Identifier: MIT
3
+ import glob
4
+ import os
5
+ import time
6
+ from os import path as osp
7
+ from typing import List, Tuple
8
+ import json
9
+ import hashlib
10
+ import copy
11
+ import collections
12
+
13
+ import tqdm
14
+ import numpy as np
15
+ import torch
16
+ import pandas as pd
17
+ from decord import VideoReader
18
+ from PIL import Image as PImage
19
+ from torchvision.transforms.functional import to_tensor
20
+ from torch.utils.data import IterableDataset, DataLoader
21
+ import torch.distributed as tdist
22
+ from PIL import Image
23
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
24
+
25
+ from infinity.schedules.dynamic_resolution import get_dynamic_resolution_meta
26
+ from infinity.utils.video_decoder import EncodedVideoDecord, EncodedVideoOpencv
27
+ from transformers import AutoTokenizer
28
+
29
def transform(pil_img, tgt_h, tgt_w):
    """Resize ``pil_img`` to cover ``tgt_w`` x ``tgt_h``, center-crop and rescale.

    The image is resized (Lanczos) keeping its aspect ratio so that it fully
    covers the target, the central ``tgt_h`` x ``tgt_w`` window is cut out,
    converted with ``to_tensor`` and mapped through ``2 * x - 1``.

    Args:
        pil_img: PIL image.
        tgt_h: Target height in pixels (int).
        tgt_w: Target width in pixels (int).

    Returns:
        The cropped image as a tensor after the ``2 * x - 1`` mapping.
    """
    width, height = pil_img.size
    # Exact integer arithmetic: comparing and scaling float aspect ratios and
    # truncating with int() could land one pixel below the target side, which
    # would make the crop below smaller than requested.
    if width * tgt_h <= tgt_w * height:
        resized_width = tgt_w
        resized_height = tgt_w * height // width
    else:
        resized_height = tgt_h
        resized_width = width * tgt_h // height
    pil_img = pil_img.resize((resized_width, resized_height), resample=PImage.LANCZOS)
    # crop the center out
    arr = np.array(pil_img)
    crop_y = (arr.shape[0] - tgt_h) // 2
    crop_x = (arr.shape[1] - tgt_w) // 2
    im = to_tensor(arr[crop_y: crop_y + tgt_h, crop_x: crop_x + tgt_w])
    return im.add(im).add_(-1)
45
+
46
def get_prompt_id(prompt):
    """Return the hexadecimal MD5 digest of ``prompt`` encoded as UTF-8."""
    return hashlib.md5(prompt.encode('utf-8')).hexdigest()
51
+
52
def prepend_motion_score(prompt, motion_score):
    """Prefix ``prompt`` with a motion-score tag.

    The score is rounded to the nearest integer first, so the tag always shows
    a value ending in ``.0``.
    """
    rounded_score = round(motion_score)
    return f'<<<motion_score: {rounded_score:.1f}>>> {prompt}'
54
+
55
class VideoReaderWrapper(VideoReader):
    """``VideoReader`` that seeks back to frame 0 after construction and after
    every indexed read."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.seek(0)

    def __getitem__(self, key):
        fetched = super().__getitem__(key)
        # Rewind so the next access starts from the beginning again.
        self.seek(0)
        return fetched
63
+
64
+
65
+ class JointViIterableDataset(IterableDataset):
66
def __init__(
    self,
    video_meta_folder: str = '',
    buffersize: int = 1000000 * 300,
    seed: int = 0,
    pn: str = '',
    video_fps: int = 1,
    num_replicas: int = 1,
    rank: int = 0,
    dataloader_workers: int = 2,
    dynamic_resolution_across_gpus: bool = True,
    enable_dynamic_length_prompt: bool = True,
    shuffle: bool = True,
    short_prob: float = 0.2,
    verbose=False,
    temp_dir= "/dev/shm",
    add_motion_score2caption=False,
    other_args=None,
    **kwargs,
):
    """Store the configuration, load the metadata and pre-compute the batches.

    Args:
        video_meta_folder: Folder searched (``*/*.jsonl``) for metadata files.
        buffersize: Stored as ``self.buffer_size``.
        seed: Stored as ``self.seed``.
        pn: Key into the per-aspect-ratio resolution table.
        video_fps: Frame rate used to turn frame counts into durations.
        num_replicas: Number of data-parallel replicas.
        rank: Index of this replica.
        dataloader_workers: Worker count per replica (at least 1 is used).
        dynamic_resolution_across_gpus: Stored as-is.
        enable_dynamic_length_prompt: Enables random caption shortening.
        shuffle: Stored as-is.
        short_prob: Probability used for caption shortening.
        verbose: Stored as-is.
        temp_dir: Scratch directory; a trailing ``/`` is stripped.
        add_motion_score2caption: Stored as-is.
        other_args: Namespace-like object supplying the remaining settings.
            Effectively required: its attributes are read unconditionally
            despite the ``None`` default.
        **kwargs: Accepted and ignored.
    """
    self.video_meta_folder = video_meta_folder
    self.pn = pn
    self.verbose = verbose
    self.buffer_size = buffersize
    self.num_replicas = num_replicas
    self.rank = rank
    self.worker_id = 0
    self.global_worker_id = 0
    self.short_prob = short_prob
    self.dataloader_workers = max(1, dataloader_workers)
    self.shuffle = shuffle
    self.global_workers = self.num_replicas * self.dataloader_workers
    self.add_motion_score2caption = add_motion_score2caption
    self.seed = seed
    self.text_tokenizer = other_args.text_tokenizer
    self.feature_extraction = other_args.cache_check_mode < 0 # no sequence packing, for feature extraction
    self.epoch_generator = None
    self.epoch_worker_generator = None
    self.epoch_global_worker_generator = None
    self.epoch_rank_generator = None
    self.other_args = other_args
    self.drop_long_video = other_args.drop_long_video
    self.dynamic_resolution_across_gpus = dynamic_resolution_across_gpus
    self.enable_dynamic_length_prompt = enable_dynamic_length_prompt
    # presumably (re)creates the epoch_* generators used by get_meta; set_epoch
    # is defined outside this view - TODO confirm
    self.set_epoch(other_args.epoch)
    self.temporal_compress_rate = other_args.temporal_compress_rate
    self.dynamic_resolution_h_w, self.h_div_w_templates = get_dynamic_resolution_meta(other_args.dynamic_scale_schedule, other_args.video_frames) # here video_frames is the max video frames
    self.train_h_div_w_list = self.h_div_w_templates
    self.video_fps = video_fps
    self.min_training_duration = (other_args.min_video_frames-1) // self.video_fps
    self.max_training_duration = (other_args.video_frames-1) // self.video_fps
    self.append_duration2caption = other_args.append_duration2caption
    print(f"{self.rank=} dataset {self.seed=}, {self.append_duration2caption=} add_motion_score2caption={add_motion_score2caption}, {self.min_training_duration=} {self.max_training_duration=}, cache_check_mode={self.other_args.cache_check_mode}")
    self.token_cache_dir = other_args.token_cache_dir
    self.use_vae_token_cache = other_args.use_vae_token_cache
    self.allow_online_vae_feature_extraction = other_args.allow_online_vae_feature_extraction
    self.use_text_token_cache = other_args.use_text_token_cache
    self.max_video_frames = other_args.video_frames
    self.cached_video_frames = other_args.cached_video_frames # cached max video frames
    self.image_batches_multiply = other_args.image_batches_multiply
    self.down_size_limit = other_args.down_size_limit
    self.addition_pn_list = json.loads(other_args.addition_pn_list)
    self.video_caption_type = other_args.video_caption_type
    self.train_max_token_len = other_args.train_max_token_len
    self.duration_resolution = other_args.duration_resolution
    self.device = other_args.device
    print(f'self.down_size_limit: {self.down_size_limit}')
    self.max_text_len = other_args.tlen
    self.temp_dir = temp_dir.rstrip("/")
    # Both calls below read the attributes set above, so they must stay last.
    self.metas = self.get_meta()
    self.batches, self.batch_nums = self.form_batches(self.metas)
    print(f'{num_replicas=}, {rank=}, {dataloader_workers=}, {self.batch_nums=}, {self.drop_long_video=} {self.max_text_len=}')
139
+
140
def append_duration_info(self, meta, mapped_duration):
    """Prefix ``meta['caption']`` with a ``<<<t=...s>>>`` duration tag.

    ``meta`` is modified in place and also returned.
    """
    duration_tag = f'<<<t={mapped_duration}s>>>'
    meta['caption'] = duration_tag + meta['caption']
    return meta
143
+
144
def get_captions_lens(self, captions):
    """Return the number of non-padding tokens of every caption.

    For the ``flan_t5`` tokenizer type the captions are padded/truncated to the
    tokenizer's ``model_max_length`` and the attention mask is summed. For any
    other type the tokenizer is expected to return ``(ids, mask)`` and the
    positive mask entries are counted.

    Args:
        captions: List of caption strings.

    Returns:
        List with one token count per caption.
    """
    tokenizer = self.other_args.text_tokenizer
    if self.other_args.text_tokenizer_type == 'flan_t5':
        tokens = tokenizer(text=captions, max_length=tokenizer.model_max_length, padding='max_length', truncation=True, return_tensors='pt')
        # Sum on the host: moving the mask to the GPU only to bring the result
        # straight back as a list needlessly required CUDA here.
        lens = tokens.attention_mask.sum(dim=-1).tolist()
    else: # umt5-xxl
        ids, mask = tokenizer(captions, return_mask=True, add_special_tokens=True)
        lens = mask.gt(0).sum(dim=1).tolist()
    return lens
153
+
154
+ def get_meta(self):
155
+ part_filepaths = sorted(glob.glob(osp.join(self.video_meta_folder, '*/*.jsonl')))
156
+ self.epoch_generator.shuffle(part_filepaths)
157
+ print(f'jsonls sample: {part_filepaths[:4]}')
158
+ if self.num_replicas > 1:
159
+ part_filepaths = part_filepaths[self.rank::self.num_replicas]
160
+
161
+ metas = []
162
+ pbar = tqdm.tqdm(total=len(part_filepaths))
163
+ mapped_duration2freqs = collections.defaultdict(int)
164
+ total, corrupt = 0, 0
165
+ stop_read = False
166
+ rough_h_div_w = self.h_div_w_templates[np.argmin(np.abs((9/16-self.h_div_w_templates)))]
167
+ for part_filepath in part_filepaths:
168
+ if stop_read:
169
+ break
170
+ pbar.update(1)
171
+ with open(part_filepath, 'r', encoding='utf-8') as f:
172
+ lines = f.readlines()
173
+ for line in lines:
174
+ total += 1
175
+ try:
176
+ meta = json.loads(line)
177
+ except Exception as e:
178
+ print(e)
179
+ corrupt += 1
180
+ print(e, corrupt, total, corrupt/total)
181
+ continue
182
+ if 'h_div_w' in meta:
183
+ del meta['h_div_w']
184
+ if 'video_path' in meta:
185
+ begin_frame_id, end_frame_id, fps = meta['begin_frame_id'], meta['end_frame_id'], meta['fps']
186
+ real_duration = (end_frame_id - begin_frame_id) / fps
187
+ mapped_duration = int(real_duration / self.duration_resolution) * self.duration_resolution
188
+ if mapped_duration < self.min_training_duration:
189
+ continue
190
+ if mapped_duration > self.max_training_duration:
191
+ if self.drop_long_video:
192
+ continue
193
+ else:
194
+ mapped_duration = self.max_training_duration
195
+ caption_type = 'tarsier2_caption'
196
+ if ('MiniCPM_V_2_6_caption' in meta) and meta['MiniCPM_V_2_6_caption']:
197
+ caption_type = self.epoch_rank_generator.choice(['tarsier2_caption', 'MiniCPM_V_2_6_caption'])
198
+ meta['caption'] = meta[caption_type]
199
+ if self.enable_dynamic_length_prompt and (self.epoch_rank_generator.random() < self.short_prob):
200
+ meta['caption'] = self.random_drop_sentences(meta['caption'])
201
+ if 'quality_prompt' in meta:
202
+ meta['caption'] = meta['caption'] + ' ' + meta['quality_prompt']
203
+ if self.append_duration2caption:
204
+ meta = self.append_duration_info(meta, mapped_duration)
205
+ assert meta['caption']
206
+ sample_frames = int(mapped_duration * self.video_fps + 1)
207
+ pt = (sample_frames-1) // self.temporal_compress_rate + 1
208
+ scale_schedule = self.dynamic_resolution_h_w[rough_h_div_w][self.pn]['pt2scale_schedule'][pt]
209
+ meta['sample_frames'] = sample_frames
210
+ elif 'image_path' in meta:
211
+ mapped_duration = -1
212
+ scale_schedule = self.dynamic_resolution_h_w[rough_h_div_w][self.pn]['pt2scale_schedule'][1]
213
+ if not meta['text']:
214
+ meta['caption'] = meta['long_caption']
215
+ elif not meta['long_caption']:
216
+ meta['caption'] = meta['text']
217
+ else:
218
+ if self.epoch_rank_generator.random() < self.other_args.short_cap_prob:
219
+ meta['caption'] = meta['text']
220
+ else:
221
+ meta['caption'] = meta['long_caption']
222
+ if self.enable_dynamic_length_prompt and (self.epoch_rank_generator.random() < self.short_prob):
223
+ meta['caption'] = self.random_drop_sentences(meta['caption'])
224
+ else:
225
+ raise ValueError(f'video_path or image_path not exist in meta: {meta}')
226
+
227
+ cum_visual_tokens = np.array(scale_schedule).prod(-1).cumsum()
228
+ meta['cum_text_visual_tokens'] = cum_visual_tokens
229
+ if self.other_args.cache_check_mode == 1: # check at the begining
230
+ if self.exists_cache_file(meta):
231
+ metas.append(meta)
232
+ elif self.other_args.cache_check_mode == -1: # select unexist, used for token cache
233
+ if not self.exists_cache_file(meta):
234
+ metas.append(meta)
235
+ else:
236
+ metas.append(meta)
237
+ mapped_duration2freqs[mapped_duration] += 1
238
+ if (self.other_args.restrict_data_size > 0) and (len(metas) > self.other_args.restrict_data_size / self.num_replicas):
239
+ stop_read = True
240
+ break
241
+
242
+ # metas = sorted(metas, key=lambda x: -x['text_visual_tokens'])
243
+
244
+ # append text tokens
245
+ metas = self.append_text_tokens(metas)
246
+
247
+ self.epoch_rank_generator.shuffle(metas)
248
+ for mapped_duration in sorted(mapped_duration2freqs.keys()):
249
+ freq = mapped_duration2freqs[mapped_duration]
250
+ proportion = freq / len(metas) * 100
251
+ print(f'{mapped_duration=}, {freq=}, {proportion=:.1f}%')
252
+ return metas
253
+
254
def append_text_tokens(self, metas, bucket_size=100):
    """Add text-token counts to every meta and fold them into its token totals.

    Captions are tokenized ``bucket_size`` at a time (skipped, with length 0,
    in feature-extraction mode). For each meta this sets ``text_tokens``
    (capped at ``self.max_text_len``), adds it to ``cum_text_visual_tokens``
    and stores the last cumulative value as ``text_visual_tokens``.

    Returns:
        ``metas`` with its entries updated in place.
    """
    t1 = time.time()
    max_text_visual_tokens = -1
    num_buckets = len(metas) // bucket_size + 1
    pbar = tqdm.tqdm(total=num_buckets, desc='append text tokens')
    for bucket_id in range(num_buckets):
        pbar.update(1)
        start = bucket_id * bucket_size
        end = min(start + bucket_size, len(metas))
        if start >= end:
            break
        bucket = metas[start:end]
        if self.feature_extraction:
            lens = [0] * len(bucket)
        else:
            captions = [meta['caption'] for meta in bucket]
            assert len(captions), f'{len(captions)=}'
            lens = self.get_captions_lens(captions)
        for offset, meta in enumerate(bucket):
            meta['text_tokens'] = min(self.max_text_len, lens[offset])
            # Out-of-place add: builds a new cumulative array shifted by the text length.
            meta['cum_text_visual_tokens'] = meta['cum_text_visual_tokens'] + meta['text_tokens']
            meta['text_visual_tokens'] = meta['cum_text_visual_tokens'][-1]
            max_text_visual_tokens = max(max_text_visual_tokens, meta['text_visual_tokens'])
    if not self.other_args.allow_less_one_elem_in_seq:
        assert max_text_visual_tokens <= self.train_max_token_len, f'{self.train_max_token_len=} should > {max_text_visual_tokens=}'
    t2 = time.time()
    print(f'append text tokens: {t2-t1:.1f}s')
    return metas
280
+
281
def exists_cache_file(self, meta):
    """Report whether the feature cache file for ``meta`` exists on disk.

    Image metas (those with an ``image_path`` key) are resolved through
    ``get_image_cache_file``; everything else is treated as a video meta and
    resolved through ``get_video_cache_file``.
    """
    if 'image_path' in meta:
        image_cache = self.get_image_cache_file(meta['image_path'])
        return osp.exists(image_cache)

    video_path = meta['video_path']
    if '/vdataset/clip' in video_path:  # clip
        # Clip files are addressed with frame ids rebased to start at 0.
        first_frame, last_frame = 0, meta['end_frame_id'] - meta['begin_frame_id']
    else:
        first_frame, last_frame = meta['begin_frame_id'], meta['end_frame_id']
    video_cache = self.get_video_cache_file(video_path, first_frame, last_frame, self.video_fps)
    return osp.exists(video_cache)
290
+
291
def form_batches(self, metas):
    """Group meta indices into batches whose token totals fit the budget.

    With ``self.feature_extraction`` set, every sample becomes its own batch.
    Otherwise samples are packed greedily: indices are visited in ``step``
    strided lanes, each batch starts from the current lane position and keeps
    absorbing unused samples whose ``text_visual_tokens`` still fit in
    ``self.train_max_token_len``. Any remaining room is then topped up with
    samples picked via their ``cum_text_visual_tokens`` prefix sums
    (presumably these samples get truncated downstream -- TODO confirm).

    Args:
        metas: list of meta dicts carrying ``text_visual_tokens`` and
            ``cum_text_visual_tokens``.

    Returns:
        ``(batches, batch_num)``: ``batches`` is a list of index lists and
        ``batch_num`` is the batch count, reduced with MIN across ranks when
        ``self.num_replicas > 1`` and rounded down to a multiple of
        ``self.dataloader_workers``.
    """
    import bisect  # function-scope import, hoisted out of the innermost loop

    st = time.time()
    num_metas = len(metas)
    if self.feature_extraction:  # no sequence packing, for feature extraction
        batches = [[ind] for ind in range(num_metas)]
    else:
        batches = []
        has_been_used = [False] * num_metas
        # max(1, ...) keeps the stride computation defined for empty metas
        # (or a zero seq_pack_bucket), which used to raise ZeroDivisionError.
        bucket_size = max(1, min(num_metas, self.other_args.seq_pack_bucket))
        print(f'[data preprocess] form_batches form {len(metas)} metas, bucket_size={bucket_size}...')
        step = num_metas // bucket_size + 1
        for bucket_id in range(step):
            left_ptr = bucket_id
            while left_ptr < num_metas:
                tmp_batch = [left_ptr]
                tokens_remain = self.train_max_token_len - metas[left_ptr]['text_visual_tokens']
                left_ptr += step
                # Absorb consecutive lane entries while the next one still fits.
                while (left_ptr < num_metas) and (metas[left_ptr]['text_visual_tokens'] <= tokens_remain):
                    if not has_been_used[left_ptr]:
                        has_been_used[left_ptr] = True
                        tokens_remain -= metas[left_ptr]['text_visual_tokens']
                        tmp_batch.append(left_ptr)
                    left_ptr += step
                # Look past the first non-fitting entry for smaller unused samples.
                tmp_ptr = left_ptr + step
                while tmp_ptr < num_metas and tokens_remain > 0:
                    if (not has_been_used[tmp_ptr]) and (metas[tmp_ptr]['text_visual_tokens'] <= tokens_remain):
                        has_been_used[tmp_ptr] = True
                        tokens_remain -= metas[tmp_ptr]['text_visual_tokens']
                        tmp_batch.append(tmp_ptr)
                    tmp_ptr += step

                # Fill the leftover room with samples whose first prefix sum
                # fits in tokens_remain, to raise sequence utilisation.
                if tokens_remain > 0:
                    increase_seq_usage_times = 0
                    while increase_seq_usage_times == 0 or (tokens_remain > self.max_text_len):
                        increase_seq_usage_times += 1
                        if increase_seq_usage_times >= 3:
                            break
                        already_selected = set(tmp_batch)
                        candidates = []
                        min_val = 99999999
                        for tmp_ind in range(bucket_id, num_metas, step):
                            cum_tokens = metas[tmp_ind]['cum_text_visual_tokens']
                            if (cum_tokens[0] <= tokens_remain) and (tmp_ind not in already_selected):
                                idx = bisect.bisect_right(cum_tokens, tokens_remain)
                                leftover = tokens_remain - cum_tokens[idx-1]
                                if leftover < min_val:
                                    min_val = leftover
                                    candidates = [tmp_ind]
                                elif leftover == min_val:
                                    candidates.append(tmp_ind)
                        if len(candidates):
                            tmp_batch.append(self.epoch_rank_generator.choice(candidates))
                            tokens_remain = min_val
                        else:
                            break
                batches.append(tmp_batch)
                if len(batches) % 1000 == 0:
                    print(f'form {len(batches)} batches, left_ptr={left_ptr}, len(metas)={len(metas)}')
    batch_num = len(batches)
    print(f'[data preprocess] form_batches done, got {len(batches)} batches, cost {time.time()-st:.2f}s')
    if self.num_replicas > 1:
        try:
            # Use a separate tensor so a failed reduction leaves batch_num as
            # the local int instead of leaking a tensor to the caller.
            batch_num_tensor = torch.tensor([batch_num], device=self.device)
            if tdist.is_initialized():
                tdist.all_reduce(batch_num_tensor, op=tdist.ReduceOp.MIN)
            batch_num = batch_num_tensor.item()
        except Exception as e:
            print(e)
    batch_num = batch_num // self.dataloader_workers * self.dataloader_workers
    print(f'[data preprocess] form_batches done, got {batch_num} batches')
    return batches, batch_num
362
+
363
def set_global_worker_id(self):
    """Record this DataLoader worker's local and global ids on ``self``.

    Reads ``torch.utils.data.get_worker_info()``; when it returns nothing
    (no worker process), the worker id is 0 and the worker count is 1.

    Sets:
        ``self.worker_id``: id of the worker within this rank.
        ``self.global_worker_id``: ``rank * dataloader_workers + worker_id``.

    Raises:
        AssertionError: if the observed worker count differs from
            ``self.dataloader_workers``.
    """
    worker_info = torch.utils.data.get_worker_info()
    if worker_info:
        worker_total_num = worker_info.num_workers
        worker_id = worker_info.id
    else:
        worker_id = 0
        worker_total_num = 1
    # The message used to be `print(...)`, which evaluates to None and left
    # the AssertionError without any text.
    assert worker_total_num == self.dataloader_workers, (
        f'worker count mismatch: DataLoader reports {worker_total_num} workers, '
        f'dataset expects dataloader_workers={self.dataloader_workers}'
    )
    self.worker_id = worker_id
    self.global_worker_id = self.rank * self.dataloader_workers + worker_id
374
+
375
def set_epoch(self, epoch):
    """Store ``epoch`` on the dataset, then rebuild the random generators."""
    self.epoch = epoch
    # Generator seeds include the epoch, so they must be rebuilt after it changes.
    self.set_generator()
378
+
379
def set_generator(self, ):
    """Create the four numpy random generators used by the dataset.

    All seeds start from ``self.seed + self.epoch``; the worker, global-worker
    and rank generators additionally add ``self.worker_id``,
    ``self.global_worker_id`` and ``self.rank`` respectively.
    """
    new_rng = np.random.default_rng
    self.epoch_generator = new_rng(self.seed + self.epoch)
    self.epoch_worker_generator = new_rng(self.seed + self.epoch + self.worker_id)
    self.epoch_global_worker_generator = new_rng(self.seed + self.epoch + self.global_worker_id)
    self.epoch_rank_generator = new_rng(self.seed + self.epoch + self.rank)
384
+
385
def __iter__(self):
    """Yield packed batches for the current DataLoader worker.

    The worker starts at ``self.batches[self.worker_id]`` and advances by
    ``self.dataloader_workers`` (wrapping around), until it has yielded
    ``self.batch_nums // self.dataloader_workers`` batches. Batches that are
    filtered out by ``cache_check_mode``, that produce no usable sample, or
    that raise while being prepared are skipped.

    Yields:
        dict with keys ``captions``, ``images``, ``addition_pn_images``,
        ``feature_cache_files4images``, ``raw_features_bcthw``,
        ``text_cond_tuple`` (always ``None``), ``text_feature_cache_files``
        and ``media`` (always ``'videos'``). Captions of samples loaded from
        raw pixels come first, followed by captions of cached-feature samples.

    Raises:
        ValueError: if more than 600 seconds pass without yielding a batch.
    """
    self.set_global_worker_id()
    self.set_generator()
    self.epoch_rank_generator.shuffle(self.batches)
    yield_data_cnt = 0
    batch_ind_ptr = self.worker_id
    failed_batch_cnt = 0
    last_yield_data_time = time.time()
    while yield_data_cnt < self.batch_nums // self.dataloader_workers:
        # Checked outside the try: inside it the error was swallowed by the
        # generic handler below and the loop kept spinning without progress.
        if time.time() - last_yield_data_time > 600:
            raise ValueError(f'[dataset] it takes too long to yield data, please check your code')
        try:
            batch_inds = self.batches[batch_ind_ptr%len(self.batches)]
            cache_check_mode = self.other_args.cache_check_mode
            if cache_check_mode in [-2, 2, 3]:  # check the vae token cache at each iteration
                cached_flags = [self.exists_cache_file(self.metas[j]) for j in batch_inds]
                all_has_been_cached = all(cached_flags)
                all_has_not_been_cached = not any(cached_flags)
                skip_batch = (
                    (cache_check_mode == 2 and not all_has_been_cached)  # every sample must be cached
                    or (cache_check_mode == -2 and all_has_been_cached)  # skip fully cached batches
                    or (cache_check_mode == 3 and all_has_not_been_cached)  # at least one must be cached
                )
                if skip_batch:
                    batch_ind_ptr += self.dataloader_workers
                    continue

            batch_data = []
            for j in batch_inds:
                meta = self.metas[j]
                if 'image_path' in meta:
                    ret, model_input = self.prepare_image_input(meta)
                elif 'video_path' in meta:
                    ret, model_input = self.prepare_video_input(meta)
                else:
                    # Neither key present: skip instead of reusing the
                    # previous sample's ret/model_input.
                    ret, model_input = False, None
                if ret:
                    batch_data.append(model_input)
            if not len(batch_data):
                batch_ind_ptr += self.dataloader_workers
                continue

            captions4images, captions4raw_features = [], []
            images, raw_features_bcthw, feature_cache_files4images = [], [], []
            text_feature_cache_files = []
            addition_pn_images = {}
            for item in batch_data:
                if item['raw_features_cthw'] is None:
                    images.append(item['img_T3HW'].permute(1,0,2,3))  # tchw -> cthw
                    for key in item:
                        if key.startswith('img_T3HW_'):
                            addition_pn_images.setdefault(key, []).append(item[key].permute(1,0,2,3))
                    feature_cache_files4images.append(item['feature_cache_file'])
                    captions4images.append(item['text_input'])
                else:
                    raw_features_bcthw.append(item['raw_features_cthw'])
                    captions4raw_features.append(item['text_input'])
                text_feature_cache_files.append(item['text_feature_cache_file'])
            captions = captions4images + captions4raw_features
            text_cond_tuple = None
            yield {
                'captions': captions,
                'images': images,
                'addition_pn_images': addition_pn_images,
                'feature_cache_files4images': feature_cache_files4images,
                'raw_features_bcthw': raw_features_bcthw,
                'text_cond_tuple': text_cond_tuple,
                'text_feature_cache_files': text_feature_cache_files,
                'media': 'videos',
            }
            yield_data_cnt += 1
            batch_ind_ptr += self.dataloader_workers
            del batch_data
            del images
            del captions
            last_yield_data_time = time.time()
        except Exception as e:
            batch_ind_ptr += self.dataloader_workers
            failed_batch_cnt += 1
            if failed_batch_cnt % 400 == 0:
                print(f'failed_batch_cnt: {failed_batch_cnt}, yield_data_cnt: {yield_data_cnt}')
                print(f'[dataset] error: {e}')
479
+
480
def prepare_image_input(self, info) -> Tuple:
    """Build the model input for one image sample.

    Cached text features and cached VAE features are loaded when the
    corresponding ``use_*_token_cache`` flags are set and the files exist.
    If no VAE features are available the image is opened, converted to RGB
    and passed through ``transform`` at the resolution picked from
    ``self.dynamic_resolution_h_w`` for the closest aspect-ratio template.

    Args:
        info: meta dict with at least ``image_path`` and ``caption``.

    Returns:
        ``(True, data_item)`` on success, where ``data_item`` has the keys
        ``text_input``, ``img_T3HW``, ``raw_features_cthw``,
        ``feature_cache_file``, ``text_features_lenxdim`` and
        ``text_feature_cache_file``; ``(False, None)`` when the sample cannot
        be prepared (including any exception, which is printed).
    """
    try:
        img_path, text_input = osp.abspath(info['image_path']), info['caption']
        img_T3HW, raw_features_cthw, feature_cache_file = None, None, None
        text_features_lenxdim, text_feature_cache_file = None, None
        if self.use_text_token_cache:
            text_feature_cache_file = osp.join(self.token_cache_dir, 'flan-t5-xl-official', get_prompt_id(text_input)+'.pt')
            if osp.exists(text_feature_cache_file):
                text_features_lenxdim = torch.load(text_feature_cache_file, weights_only=True)

        if self.add_motion_score2caption:
            rand_motion_score = -1 + self.epoch_rank_generator.random() * 21.0  # -1.0 ~ 20.0
            text_input = prepend_motion_score(text_input, rand_motion_score)
        if self.use_vae_token_cache:
            feature_cache_file = self.get_image_cache_file(img_path)
            if osp.exists(feature_cache_file):
                try:
                    raw_features_cthw = torch.load(feature_cache_file, weights_only=True)
                except Exception as e:
                    print(f'load cache file error: {e}')
                    raw_features_cthw = None
                    # Drop the unreadable cache file. A failed removal (e.g.
                    # another worker already deleted it) must not abort the
                    # sample when online extraction can still recover it.
                    try:
                        os.remove(feature_cache_file)
                    except OSError as remove_err:
                        print(f'remove cache file error: {remove_err}')
            if raw_features_cthw is None and (not self.allow_online_vae_feature_extraction):
                return False, None
        if raw_features_cthw is None:
            with open(img_path, 'rb') as f:
                img: PImage.Image = PImage.open(f)
                w, h = img.size
                h_div_w = h / w
                # Snap to the closest supported aspect-ratio template.
                h_div_w_template = self.h_div_w_templates[np.argmin(np.abs((h_div_w-self.h_div_w_templates)))]
                tgt_h, tgt_w = self.dynamic_resolution_h_w[h_div_w_template][self.pn]['pixel']
                img = img.convert('RGB')
                img_T3HW = transform(img, tgt_h, tgt_w)
                img_T3HW = img_T3HW.unsqueeze(0)  # add a leading axis of size 1
                assert img_T3HW.shape[1] == 3
        data_item = {
            'text_input': text_input,
            'img_T3HW': img_T3HW,
            'raw_features_cthw': raw_features_cthw,
            'feature_cache_file': feature_cache_file,
            'text_features_lenxdim': text_features_lenxdim,
            'text_feature_cache_file': text_feature_cache_file,
        }
        return True, data_item
    except Exception as e:
        print(f'prepare_image_input error: {e}')
        return False, None
526
+
527
def prepare_pair_image_input(self, info) -> Tuple:
    """Placeholder for paired image inputs; not implemented, returns ``None``."""
    return None
529
+
530
def prepare_pair_video_input(self, info) -> Tuple:
    """Prepare a (win, lose) video pair that shares one caption/meta.

    ``info['win_video_path']`` and ``info['lose_video_path']`` are each loaded
    through ``prepare_video_input`` on a deep copy of ``info``. Both must be
    loaded from raw frames (no cached features).

    Returns:
        ``(True, data_item)`` where ``data_item`` is the win item with
        ``img_T3HW`` replaced by the stacked pair ``[win, lose]`` along a new
        leading axis, or ``(False, None)`` if either video could not be
        prepared.

    Raises:
        AssertionError: if either video came back with cached features.
    """
    tmp_info = copy.deepcopy(info)
    pair_items = []
    for path_key in ('win_video_path', 'lose_video_path'):
        tmp_info['video_path'] = info[path_key]
        flag, data_item = self.prepare_video_input(tmp_info)
        # prepare_video_input signals failure with (False, None); subscripting
        # that None used to raise TypeError before the flag was ever checked.
        if not flag or data_item is None:
            return False, None
        assert data_item['raw_features_cthw'] is None
        pair_items.append(data_item)

    win_data_item, lose_data_item = pair_items
    img_T3HW = torch.stack([win_data_item['img_T3HW'], lose_data_item['img_T3HW']], dim=0)  # [2,T,C,H,W]
    win_data_item['img_T3HW'] = img_T3HW
    return True, win_data_item
544
+
545
def prepare_video_input(self, info) -> Tuple:
    """Build the model input for one video sample.

    Cached VAE features are used when ``self.use_vae_token_cache`` is set and
    the cache file loads correctly (the result is cut to the number of latent
    frames implied by ``sample_frames`` and ``self.temporal_compress_rate``).
    Otherwise the video is decoded with ``EncodedVideoOpencv`` and each frame
    is passed through ``transform``.

    Args:
        info: meta dict with ``video_path``, ``begin_frame_id``,
            ``end_frame_id``, ``caption`` and ``sample_frames``.

    Returns:
        ``(True, data_item)`` on success, ``(False, None)`` when neither cached
        features nor a readable local video are available. ``data_item`` has
        the keys ``text_input``, ``img_T3HW``, ``raw_features_cthw``,
        ``feature_cache_file``, ``text_features_lenxdim``,
        ``text_feature_cache_file`` plus one ``img_T3HW_<pn>`` entry per
        additional ``pn`` when frames were decoded.

    Raises:
        AssertionError: if the requested clip ends more than 0.2 s past the
            video duration. Decoding errors are not caught here.
    """
    filename, begin_frame_id, end_frame_id = (
        info["video_path"],
        info["begin_frame_id"],
        info["end_frame_id"],
    )

    raw_features_cthw, feature_cache_file = None, None
    text_features_lenxdim, text_feature_cache_file = None, None
    img_T3HW_4additional_pn = {}
    text_input = info['caption']
    if '/vdataset/clip' in filename:  # clip
        # Clip files are addressed with frame ids rebased to start at 0.
        begin_frame_id, end_frame_id = 0, end_frame_id - begin_frame_id
    sample_frames = info['sample_frames']
    if self.use_vae_token_cache:
        feature_cache_file = self.get_video_cache_file(info["video_path"], begin_frame_id, end_frame_id, self.video_fps)
        if osp.exists(feature_cache_file):
            try:
                pt = (sample_frames-1) // self.temporal_compress_rate + 1
                raw_features_cthw = torch.load(feature_cache_file, weights_only=True)
                assert raw_features_cthw.shape[1] >= pt, f'raw_features_cthw.shape[1] >= pt: {raw_features_cthw.shape[1]} vs {pt}'
                if raw_features_cthw.shape[1] > pt:
                    raw_features_cthw = raw_features_cthw[:,:pt]
            except Exception as e:
                print(f'load video cache file error: {e}')
                raw_features_cthw = None
                # Drop the unusable cache file. Removal can fail when another
                # worker deleted it first; that must not crash the batch.
                try:
                    os.remove(feature_cache_file)
                except OSError as remove_err:
                    print(f'remove video cache file error: {remove_err}')
        if raw_features_cthw is None and (not self.allow_online_vae_feature_extraction):
            return False, None
    pn_list = [self.pn]
    if raw_features_cthw is None:
        local_path = info["video_path"]
        if not local_path or not osp.exists(local_path):
            return False, None
        video = EncodedVideoOpencv(local_path, os.path.basename(local_path), num_threads=0)
        start_interval = max(0, begin_frame_id / video._fps)
        end_interval = start_interval+(sample_frames-1)/self.video_fps
        assert end_interval <= video.duration + 0.2, f'{end_interval=}, but {video.duration=}'  # 0.2s margin
        end_interval = min(end_interval, video.duration)
        raw_video, _ = video.get_clip(start_interval, end_interval, sample_frames)
        h, w, _ = raw_video[0].shape
        h_div_w = h / w
        # Snap to the closest supported aspect-ratio template.
        h_div_w_template = self.h_div_w_templates[np.argmin(np.abs((h_div_w-self.h_div_w_templates)))]
        tgt_h, tgt_w = self.dynamic_resolution_h_w[h_div_w_template][self.pn]['pixel']

        pn_list = pn_list + list(self.addition_pn_list)
        # NOTE(review): tgt_h/tgt_w come from self.pn and are reused for every
        # additional pn, so all entries share one resolution -- confirm this
        # is intended rather than a per-pn lookup.
        for pn in pn_list:
            if isinstance(video, EncodedVideoDecord):
                img_T3HW = [transform(Image.fromarray(frame).convert("RGB"), tgt_h, tgt_w) for frame in raw_video]
            else:
                # Reverse the channel axis before building the PIL image
                # (presumably BGR -> RGB for OpenCV frames -- TODO confirm).
                img_T3HW = [transform(Image.fromarray(frame[:,:,::-1]), tgt_h, tgt_w) for frame in raw_video]
            img_T3HW = torch.stack(img_T3HW, 0)
            img_T3HW_4additional_pn[pn] = img_T3HW
        del video
        assert img_T3HW.shape[1] == 3
    data_item = {
        'text_input': text_input,
        'img_T3HW': img_T3HW_4additional_pn.get(self.pn, None),
        'raw_features_cthw': raw_features_cthw,
        'feature_cache_file': feature_cache_file,
        'text_features_lenxdim': text_features_lenxdim,
        'text_feature_cache_file': text_feature_cache_file,
    }
    for pn in pn_list[1:]:
        data_item.update({f'img_T3HW_{pn}': img_T3HW_4additional_pn.get(pn, None)})
    return True, data_item
626
+
627
+ @staticmethod
628
+ def collate_function(batch, online_t5: bool = False) -> None:
629
+ pass
630
+
631
def random_drop_sentences(self, caption):
    """Keep a random-length prefix of the caption's '.'-separated sentences.

    Captions with fewer than two non-empty sentences are returned unchanged.
    Otherwise the number of sentences kept is drawn from
    ``self.epoch_global_worker_generator.integers(1, n + 1)`` and the kept
    sentences are re-joined with '.' plus a trailing '.'.
    """
    sentences = list(filter(None, caption.split('.')))
    if len(sentences) >= 2:
        keep = self.epoch_global_worker_generator.integers(1, len(sentences)+1)
        return '.'.join(sentences[:keep]) + '.'
    return caption
637
+
638
def get_text_input(self, long_text_input, short_text_input, long_text_type):
    """Pick the long or the short caption for a sample.

    If only one caption is non-empty it is returned. Otherwise one draw from
    ``self.epoch_global_worker_generator.random()`` selects the short caption
    when it falls below ``self.short_prob`` and the long one otherwise.

    The dynamic-length branch (building a caption from a random number of
    leading sentences of the long text) is currently disabled by the
    assertion on ``self.enable_dynamic_length_prompt``; it is kept for when
    that assertion is lifted.

    Raises:
        AssertionError: if both captions are empty, or if
            ``self.enable_dynamic_length_prompt`` is truthy.
    """
    assert long_text_input or short_text_input
    if not long_text_input:
        return short_text_input
    if not short_text_input:
        return long_text_input
    random_value = self.epoch_global_worker_generator.random()
    assert not self.enable_dynamic_length_prompt
    if self.enable_dynamic_length_prompt and long_text_type != 'user_prompt':
        long_text_elems = [item for item in long_text_input.split('.') if item]
        if len(long_text_elems):
            first_sentence_words = [item for item in long_text_elems[0].split(' ') if item]
        else:
            # Was `0`, which made the len() call below raise TypeError.
            first_sentence_words = []
        # A long first sentence (>= 15 words) counts as the short text alone.
        num_sentence4short_text = 1 if len(first_sentence_words) >= 15 else 2
        if random_value < self.short_prob:
            return short_text_input
        if len(long_text_elems) <= num_sentence4short_text:
            return long_text_input
        select_sentence_num = self.epoch_global_worker_generator.integers(num_sentence4short_text+1, len(long_text_elems)+1)
        return '.'.join(long_text_elems[:select_sentence_num])
    if random_value < self.short_prob:
        return short_text_input
    return long_text_input
668
+
669
def __len__(self):
    """Return ``self.batch_nums``, the batch count stored on the dataset."""
    batch_count = self.batch_nums
    return batch_count
671
+
672
def get_image_cache_file(self, image_path):
    """Return the cache file path used for the features of ``image_path``.

    The path is ``<token_cache_dir>/images_pn_<pn>/<sub dirs>/<id>.pt`` where
    ``<sub dirs>`` are the path components after the first four and before
    the file name, and ``<id>`` is ``get_prompt_id`` of the file stem.
    """
    path_parts = [part for part in image_path.split('/') if part]
    stem = osp.splitext(path_parts[-1])[0]
    cache_name = get_prompt_id(stem)
    sub_dirs = '/'.join(path_parts[4:-1])
    return osp.join(self.token_cache_dir, f'images_pn_{self.pn}', sub_dirs, f'{cache_name}.pt')
679
+
680
def get_video_cache_file(self, video_path, begin_frame_id, end_frame_id, video_fps):
    """Return the cache file path used for the features of one video clip.

    The path is
    ``<token_cache_dir>/pn_<pn>_sample_fps_<video_fps>/<sub dirs>/<id>_sf_<begin>_ef_<end>.pt``
    where ``<sub dirs>`` are the path components after the first four and
    before the file name, and ``<id>`` is ``get_prompt_id`` of the file stem.
    """
    path_parts = [part for part in video_path.split('/') if part]
    stem = osp.splitext(path_parts[-1])[0]
    cache_name = get_prompt_id(stem)
    sub_dirs = '/'.join(path_parts[4:-1])
    clip_file = f'{cache_name}_sf_{begin_frame_id}_ef_{end_frame_id}.pt'
    return osp.join(self.token_cache_dir, f'pn_{self.pn}_sample_fps_{video_fps}', sub_dirs, clip_file)
687
+
688
+ if __name__ == '__main__':
689
+ pass
Meissonic/InfinityStar/infinity/models/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 FoundationVision
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ import torch
5
+ from timm.loss import SoftTargetCrossEntropy
6
+
7
+ from timm.models.layers import DropPath
8
+
9
+ from .infinity import Infinity, sample_with_top_k_top_p_also_inplace_modifying_logits_
10
+
11
def _ex_repr(self):
    """Render an object's public, non-tensor attributes as ``k=v, k=v``.

    Attributes whose name starts with ``_``, the ``training`` flag, and values
    that are ``torch.nn.Module`` or ``torch.Tensor`` instances are left out.
    Floats are formatted with ``:g``; everything else with ``str``.
    """
    rendered = []
    for k, v in vars(self).items():
        if k.startswith('_') or k == 'training':
            continue
        if isinstance(v, (torch.nn.Module, torch.Tensor)):
            continue
        value_text = f'{v:g}' if isinstance(v, float) else str(v)
        rendered.append(f'{k}=' + value_text)
    return ', '.join(rendered)
18
# Shorten the printed form of the loss modules: use _ex_repr for their
# attribute summary (through extra_repr when the class provides that hook).
for clz in (torch.nn.CrossEntropyLoss, SoftTargetCrossEntropy):
    if not hasattr(clz, 'extra_repr'):
        clz.__repr__ = lambda self: f'{type(self).__name__}({_ex_repr(self)})'
    else:
        clz.extra_repr = _ex_repr

# DropPath no longer prints its drop_prob.
DropPath.__repr__ = lambda self: f'{type(self).__name__}(...)'
25
+
26
# Short model aliases: 'd6', 'd8', ..., 'd40' -> 'infinity_d<depth>'.
alias_dict = {}
for d in range(6, 42, 2):
    alias_dict.update({f'd{d}': f'infinity_d{d}'})
# Reverse lookup from full model name back to its alias.
alias_dict_inv = dict(zip(alias_dict.values(), alias_dict.keys()))
Meissonic/InfinityStar/infinity/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.47 kB). View file