File size: 56,308 Bytes
bc8c4af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
# Registry entries for the Qwen-Image family of models.
# Every entry is a dict with:
#   "model_hash"  - hex digest string used to recognise a checkpoint
#                   (presumably derived from the weights file; confirm against the loader),
#   "model_name"  - the name the model is registered under,
#   "model_class" - dotted import path of the implementing class.
# Optional keys used below:
#   "extra_kwargs"         - dict of keyword arguments (presumably passed to the
#                            model class constructor; confirm against the loader),
#   "state_dict_converter" - dotted import path of a state-dict converter class.
# The "# Example:" comment above each entry shows a ModelConfig that refers to
# a checkpoint of that kind.
qwen_image_series = [
    {
        # Example: ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
        "model_hash": "0319a1cb19835fb510907dd3367c95ff",
        "model_name": "qwen_image_dit",
        "model_class": "diffsynth.models.qwen_image_dit.QwenImageDiT",
    },
    {
        # Example: ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors")
        "model_hash": "8004730443f55db63092006dd9f7110e",
        "model_name": "qwen_image_text_encoder",
        "model_class": "diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.qwen_image_text_encoder.QwenImageTextEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
        "model_hash": "ed4ea5824d55ec3107b09815e318123a",
        "model_name": "qwen_image_vae",
        "model_class": "diffsynth.models.qwen_image_vae.QwenImageVAE",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth", origin_file_pattern="model.safetensors")
        "model_hash": "073bce9cf969e317e5662cd570c3e79c",
        "model_name": "qwen_image_blockwise_controlnet",
        "model_class": "diffsynth.models.qwen_image_controlnet.QwenImageBlockWiseControlNet",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint", origin_file_pattern="model.safetensors")
        # Same class and model_name as the Depth ControlNet above; differs only by the extra "additional_in_dim" kwarg.
        "model_hash": "a9e54e480a628f0b956a688a81c33bab",
        "model_name": "qwen_image_blockwise_controlnet",
        "model_class": "diffsynth.models.qwen_image_controlnet.QwenImageBlockWiseControlNet",
        "extra_kwargs": {"additional_in_dim": 4},
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors")
        "model_hash": "469c78b61e3e31bc9eec0d0af3d3f2f8",
        "model_name": "siglip2_image_encoder",
        "model_class": "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="DINOv3-7B/model.safetensors")
        "model_hash": "5722b5c873720009de96422993b15682",
        "model_name": "dinov3_image_encoder",
        "model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
    },
    {
        # Example: (none recorded for this checkpoint)
        # The three image2lora entries below share one model_class and differ by model_name and extra_kwargs.
        "model_hash": "a166c33455cdbd89c0888a3645ca5c0f",
        "model_name": "qwen_image_image2lora_coarse",
        "model_class": "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel",
    },
    {
        # Example: (none recorded for this checkpoint)
        # "residual_length" is written as an expression; 37*37+7 evaluates to 1376.
        "model_hash": "a5476e691767a4da6d3a6634a10f7408",
        "model_name": "qwen_image_image2lora_fine",
        "model_class": "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel",
        "extra_kwargs": {"residual_length": 37*37+7, "residual_mid_dim": 64}
    },
    {
        # Example: (none recorded for this checkpoint)
        "model_hash": "0aad514690602ecaff932c701cb4b0bb",
        "model_name": "qwen_image_image2lora_style",
        "model_class": "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel",
        "extra_kwargs": {"compress_dim": 64, "use_residual": False}
    },
    {
        # Example: ModelConfig(model_id="Qwen/Qwen-Image-Layered", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
        # Same class and model_name as the base Qwen-Image DiT entry, with two extra flags enabled.
        "model_hash": "8dc8cda05de16c73afa755e2c1ce2839",
        "model_name": "qwen_image_dit",
        "model_class": "diffsynth.models.qwen_image_dit.QwenImageDiT",
        "extra_kwargs": {"use_layer3d_rope": True, "use_additional_t_cond": True}
    },
    {
        # Example: ModelConfig(model_id="Qwen/Qwen-Image-Layered", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
        # Same class and model_name as the base Qwen-Image VAE entry, with image_channels set to 4.
        "model_hash": "44b39ddc499e027cfb24f7878d7416b9",
        "model_name": "qwen_image_vae",
        "model_class": "diffsynth.models.qwen_image_vae.QwenImageVAE",
        "extra_kwargs": {"image_channels": 4}
    },
]

# Registry entries for the Wan video family of models (plus related checkpoints
# registered under the same model names).
# Every entry is a dict with "model_hash", "model_name" and "model_class"
# (a dotted import path). Optional keys used below are "extra_kwargs" (a dict of
# keyword arguments, presumably passed to the model class constructor; confirm
# against the loader) and "state_dict_converter" (dotted import path of a
# converter class).
# Several model_hash values appear in two consecutive entries with different
# model_name values; those pairs are marked with a comment below.
# NOTE(review): whether the consumer depends on the order of this list is not
# visible here, so entries should not be reordered without checking the loader.
wan_series = [
    {
        # Example: ModelConfig(model_id="krea/krea-realtime-video", origin_file_pattern="krea-realtime-video-14b.safetensors")
        "model_hash": "5ec04e02b42d2580483ad69f4e76346a",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth")
        "model_hash": "9c8818c2cbea55eca56c7b447df170da",
        "model_name": "wan_video_text_encoder",
        "model_class": "diffsynth.models.wan_video_text_encoder.WanTextEncoder",
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth")
        "model_hash": "ccc42284ea13e1ad04693284c7a09be6",
        "model_name": "wan_video_vae",
        "model_class": "diffsynth.models.wan_video_vae.WanVideoVAE",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_vae.WanVideoVAEStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="meituan-longcat/LongCat-Video", origin_file_pattern="dit/diffusion_pytorch_model*.safetensors")
        # Registered under the same model_name ("wan_video_dit") as the WanModel entries, but with a different model_class.
        "model_hash": "8b27900f680d7251ce44e2dc8ae1ffef",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.longcat_video_dit.LongCatVideoTransformer3DModel",
    },
    {
        # Example: ModelConfig(model_id="ByteDance/Video-As-Prompt-Wan2.1-14B", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
        # First of two entries for this model_hash: the "wan_video_dit" registration.
        "model_hash": "5f90e66a0672219f12d9a626c8c21f61",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTFromDiffusers"
    },
    {
        # Example: ModelConfig(model_id="ByteDance/Video-As-Prompt-Wan2.1-14B", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
        # Second entry for the same model_hash as above: the "wan_video_vap" registration.
        "model_hash": "5f90e66a0672219f12d9a626c8c21f61",
        "model_name": "wan_video_vap",
        "model_class": "diffsynth.models.wan_video_mot.MotWanModel",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_mot.WanVideoMotStateDictConverter"
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth")
        "model_hash": "5941c53e207d62f20f9025686193c40b",
        "model_name": "wan_video_image_encoder",
        "model_class": "diffsynth.models.wan_video_image_encoder.WanImageEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_image_encoder.WanImageEncoderStateDictConverter"
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", origin_file_pattern="model.safetensors")
        "model_hash": "dbd5ec76bbf977983f972c151d545389",
        "model_name": "wan_video_motion_controller",
        "model_class": "diffsynth.models.wan_video_motion_controller.WanMotionControllerModel",
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "9269f8db9040a9d860eaca435be61814",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "3ef3b1f8e1dab83d5b71fd7b617f859f",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_image_pos_emb': True}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "349723183fc063b2bfc10bb2835cf677",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "6d6ccde6845b95ad9114ab993d917893",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "efa44cddf936c70abd0ea28b6cbe946c",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "6bfcfb3b342cb286ce886889d519a77e",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "ac6a5aa74f4a0aab6f64eb9a72f19901",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 32, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06, 'has_ref_conv': False, 'add_control_adapter': True, 'in_dim_control_adapter': 24}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "70ddad9d3a133785da5ea371aae09504",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06, 'has_ref_conv': True}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "b61c605c2adbd23124d152ed28e049ae",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 32, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_ref_conv': False, 'add_control_adapter': True, 'in_dim_control_adapter': 24}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "26bde73488a92e64cc20b0a7485b9e5b",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_ref_conv': True}
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        "model_hash": "aafcfd9672c3a2456dc46e1cb6e52c70",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06}
    },
    {
        # Example: ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        # First of two entries for this model_hash: the "wan_video_dit" registration.
        "model_hash": "a61453409b67cd3246cf0c3bebad47ba",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        # Second entry for the same model_hash as above: the "wan_video_vace" registration.
        "model_hash": "a61453409b67cd3246cf0c3bebad47ba",
        "model_name": "wan_video_vace",
        "model_class": "diffsynth.models.wan_video_vace.VaceWanModel",
        "extra_kwargs": {"use_target_text_encoder": True},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_vace.VaceWanModelDictConverter"
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        # First of two entries for this model_hash: the "wan_video_dit" registration.
        "model_hash": "7a513e1f257a861512b1afd387a8ecd9",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        # Second entry for the same model_hash as above: the "wan_video_vace" registration.
        # Note that 'vace_layers' and 'patch_size' are tuples here, whereas the WanModel entries use a list for 'patch_size'.
        "model_hash": "7a513e1f257a861512b1afd387a8ecd9",
        "model_name": "wan_video_vace",
        "model_class": "diffsynth.models.wan_video_vace.VaceWanModel",
        "extra_kwargs": {'vace_layers': (0, 5, 10, 15, 20, 25, 30, 35), 'vace_in_dim': 96, 'glyph_channels': 16, 'patch_size': (1, 2, 2), 'has_image_input': False, 'dim': 5120, 'num_heads': 40, 'ffn_dim': 13824, 'eps': 1e-06},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_vace.VaceWanModelDictConverter"
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        # First of two entries for this model_hash: the "wan_video_dit" registration.
        "model_hash": "31fa352acb8a1b1d33cd8764273d80a2",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTStateDictConverter"
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        # Second entry for the same model_hash as above: the "wan_video_animate_adapter" registration.
        "model_hash": "31fa352acb8a1b1d33cd8764273d80a2",
        "model_name": "wan_video_animate_adapter",
        "model_class": "diffsynth.models.wan_video_animate_adapter.WanAnimateAdapter",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_animate_adapter.WanAnimateAdapterStateDictConverter"
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors")
        "model_hash": "47dbeab5e560db3180adf51dc0232fb1",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_ref_conv': False, 'add_control_adapter': True, 'in_dim_control_adapter': 24, 'require_clip_embedding': False}
    },
    {
        # Example: ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors")
        "model_hash": "2267d489f0ceb9f21836532952852ee5",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 52, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_ref_conv': True, 'require_clip_embedding': False},
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors")
        "model_hash": "5b013604280dd715f8457c6ed6d6a626",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'require_clip_embedding': False}
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        # Registered as "wan_video_dit" but implemented by WanS2VModel rather than WanModel.
        "model_hash": "966cffdcc52f9c46c391768b27637614",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit_s2v.WanS2VModel",
        "extra_kwargs": {'dim': 5120, 'in_dim': 16, 'ffn_dim': 13824, 'out_dim': 16, 'text_dim': 4096, 'freq_dim': 256, 'eps': 1e-06, 'patch_size': (1, 2, 2), 'num_heads': 40, 'num_layers': 40, 'cond_dim': 16, 'audio_dim': 1024, 'num_audio_token': 4}
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
        # NOTE(review): the key 'seperated_timestep' is misspelled, but it is runtime data that presumably
        # has to match the parameter name expected by WanModel -- confirm before renaming it.
        "model_hash": "1f5ab7703c6fc803fdded85ff040c316",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 3072, 'ffn_dim': 14336, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 48, 'num_heads': 24, 'num_layers': 30, 'eps': 1e-06, 'seperated_timestep': True, 'require_clip_embedding': False, 'require_vae_embedding': False, 'fuse_vae_embedding_in_latents': True}
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="Wan2.2_VAE.pth")
        # Registered as "wan_video_vae" like the Wan2.1 VAE, but with the WanVideoVAE38 class; the converter is the same.
        "model_hash": "e1de6c02cdac79f8b739f4d3698cd216",
        "model_name": "wan_video_vae",
        "model_class": "diffsynth.models.wan_video_vae.WanVideoVAE38",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_vae.WanVideoVAEStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="wav2vec2-large-xlsr-53-english/model.safetensors")
        "model_hash": "06be60f3a4526586d8431cd038a71486",
        "model_name": "wans2v_audio_encoder",
        "model_class": "diffsynth.models.wav2vec.WanS2VAudioEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.wans2v_audio_encoder.WanS2VAudioEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Wan-AI/WanToDance-14B", origin_file_pattern="global_model.safetensors")
        # WanModel configured with the additional 'wantodance_*' options listed in extra_kwargs.
        "model_hash": "eb18873fc0ba77b541eb7b62dbcd2059",
        "model_name": "wan_video_dit",
        "model_class": "diffsynth.models.wan_video_dit.WanModel",
        "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'wantodance_enable_music_inject': True, 'wantodance_music_inject_layers': [0, 4, 8, 12, 16, 20, 24, 27], 'wantodance_enable_refimage': True, 'has_ref_conv': True, 'wantodance_enable_refface': False, 'wantodance_enable_global': True, 'wantodance_enable_dynamicfps': True, 'wantodance_enable_unimodel': True}
    },
]

flux_series = [
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors")
        "model_hash": "a29710fea6dddb0314663ee823598e50",
        "model_name": "flux_dit",
        "model_class": "diffsynth.models.flux_dit.FluxDiT",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
    },
    {
        # Supported due to historical reasons.
        "model_hash": "605c56eab23e9e2af863ad8f0813a25d",
        "model_name": "flux_dit",
        "model_class": "diffsynth.models.flux_dit.FluxDiT",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverterFromDiffusers",
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors")
        "model_hash": "94eefa3dac9cec93cb1ebaf1747d7b78",
        "model_name": "flux_text_encoder_clip",
        "model_class": "diffsynth.models.flux_text_encoder_clip.FluxTextEncoderClip",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_text_encoder_clip.FluxTextEncoderClipStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors")
        "model_hash": "22540b49eaedbc2f2784b2091a234c7c",
        "model_name": "flux_text_encoder_t5",
        "model_class": "diffsynth.models.flux_text_encoder_t5.FluxTextEncoderT5",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_text_encoder_t5.FluxTextEncoderT5StateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors")
        "model_hash": "21ea55f476dfc4fd135587abb59dfe5d",
        "model_name": "flux_vae_encoder",
        "model_class": "diffsynth.models.flux_vae.FluxVAEEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors")
        "model_hash": "21ea55f476dfc4fd135587abb59dfe5d",
        "model_name": "flux_vae_decoder",
        "model_class": "diffsynth.models.flux_vae.FluxVAEDecoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEDecoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="ostris/Flex.2-preview", origin_file_pattern="Flex.2-preview.safetensors")
        "model_hash": "d02f41c13549fa5093d3521f62a5570a",
        "model_name": "flux_dit",
        "model_class": "diffsynth.models.flux_dit.FluxDiT",
        "extra_kwargs": {'input_dim': 196, 'num_blocks': 8},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/AttriCtrl-FLUX.1-Dev", origin_file_pattern="models/brightness.safetensors")
        "model_hash": "0629116fce1472503a66992f96f3eb1a",
        "model_name": "flux_value_controller",
        "model_class": "diffsynth.models.flux_value_control.SingleValueEncoder",
    },
    {
        # Example: ModelConfig(model_id="alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", origin_file_pattern="diffusion_pytorch_model.safetensors")
        "model_hash": "52357cb26250681367488a8954c271e8",
        "model_name": "flux_controlnet",
        "model_class": "diffsynth.models.flux_controlnet.FluxControlNet",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_controlnet.FluxControlNetStateDictConverter",
        "extra_kwargs": {"num_joint_blocks": 6, "num_single_blocks": 0, "additional_input_dim": 4},
    },
    {
        # Example: ModelConfig(model_id="InstantX/FLUX.1-dev-Controlnet-Union-alpha", origin_file_pattern="diffusion_pytorch_model.safetensors")
        "model_hash": "78d18b9101345ff695f312e7e62538c0",
        "model_name": "flux_controlnet",
        "model_class": "diffsynth.models.flux_controlnet.FluxControlNet",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_controlnet.FluxControlNetStateDictConverter",
        "extra_kwargs": {"num_mode": 10, "mode_dict": {"canny": 0, "tile": 1, "depth": 2, "blur": 3, "pose": 4, "gray": 5, "lq": 6}},
    },
    {
        # Example: ModelConfig(model_id="jasperai/Flux.1-dev-Controlnet-Upscaler", origin_file_pattern="diffusion_pytorch_model.safetensors")
        "model_hash": "b001c89139b5f053c715fe772362dd2a",
        "model_name": "flux_controlnet",
        "model_class": "diffsynth.models.flux_controlnet.FluxControlNet",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_controlnet.FluxControlNetStateDictConverter",
        "extra_kwargs": {"num_single_blocks": 0},
    },
    {
        # Example: ModelConfig(model_id="ByteDance/InfiniteYou", origin_file_pattern="infu_flux_v1.0/aes_stage2/image_proj_model.bin")
        "model_hash": "c07c0f04f5ff55e86b4e937c7a40d481",
        "model_name": "infiniteyou_image_projector",
        "model_class": "diffsynth.models.flux_infiniteyou.InfiniteYouImageProjector",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_infiniteyou.FluxInfiniteYouImageProjectorStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="ByteDance/InfiniteYou", origin_file_pattern="infu_flux_v1.0/aes_stage2/InfuseNetModel/*.safetensors")
        "model_hash": "7f9583eb8ba86642abb9a21a4b2c9e16",
        "model_name": "flux_controlnet",
        "model_class": "diffsynth.models.flux_controlnet.FluxControlNet",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_controlnet.FluxControlNetStateDictConverter",
        "extra_kwargs": {"num_joint_blocks": 4, "num_single_blocks": 10},
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev", origin_file_pattern="model.safetensors")
        "model_hash": "77c2e4dd2440269eb33bfaa0d004f6ab",
        "model_name": "flux_lora_encoder",
        "model_class": "diffsynth.models.flux_lora_encoder.FluxLoRAEncoder",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev", origin_file_pattern="model.safetensors")
        "model_hash": "30143afb2dea73d1ac580e0787628f8c",
        "model_name": "flux_lora_patcher",
        "model_class": "diffsynth.models.flux_lora_patcher.FluxLoraPatcher",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors")
        "model_hash": "2bd19e845116e4f875a0a048e27fc219",
        "model_name": "nexus_gen_llm",
        "model_class": "diffsynth.models.nexus_gen.NexusGenAutoregressiveModel",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.nexus_gen.NexusGenAutoregressiveModelStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin")
        "model_hash": "63c969fd37cce769a90aa781fbff5f81",
        "model_name": "nexus_gen_editing_adapter",
        "model_class": "diffsynth.models.nexus_gen_projector.NexusGenImageEmbeddingMerger",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.nexus_gen_projector.NexusGenMergerStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin")
        "model_hash": "63c969fd37cce769a90aa781fbff5f81",
        "model_name": "flux_dit",
        "model_class": "diffsynth.models.flux_dit.FluxDiT",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="generation_decoder.bin")
        "model_hash": "3e6c61b0f9471135fc9c6d6a98e98b6d",
        "model_name": "nexus_gen_generation_adapter",
        "model_class": "diffsynth.models.nexus_gen_projector.NexusGenAdapter",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.nexus_gen_projector.NexusGenAdapterStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="generation_decoder.bin")
        "model_hash": "3e6c61b0f9471135fc9c6d6a98e98b6d",
        "model_name": "flux_dit",
        "model_class": "diffsynth.models.flux_dit.FluxDiT",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="InstantX/FLUX.1-dev-IP-Adapter", origin_file_pattern="ip-adapter.bin")
        "model_hash": "4daaa66cc656a8fe369908693dad0a35",
        "model_name": "flux_ipadapter",
        "model_class": "diffsynth.models.flux_ipadapter.FluxIpAdapter",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_ipadapter.FluxIpAdapterStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="google/siglip-so400m-patch14-384", origin_file_pattern="model.safetensors")
        "model_hash": "04d8c1e20a1f1b25f7434f111992a33f",
        "model_name": "siglip_vision_model",
        "model_class": "diffsynth.models.flux_ipadapter.SiglipVisionModelSO400M",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_ipadapter.SiglipStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="stepfun-ai/Step1X-Edit", origin_file_pattern="step1x-edit-i1258.safetensors"),
        "model_hash": "d30fb9e02b1dbf4e509142f05cf7dd50",
        "model_name": "step1x_connector",
        "model_class": "diffsynth.models.step1x_connector.Qwen2Connector",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.step1x_connector.Qwen2ConnectorStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="stepfun-ai/Step1X-Edit", origin_file_pattern="step1x-edit-i1258.safetensors"),
        "model_hash": "d30fb9e02b1dbf4e509142f05cf7dd50",
        "model_name": "flux_dit",
        "model_class": "diffsynth.models.flux_dit.FluxDiT",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
        "extra_kwargs": {"disable_guidance_embedder": True},
    },
    {
        # Example: ModelConfig(model_id="MAILAND/majicflus_v1", origin_file_pattern="majicflus_v134.safetensors")
        "model_hash": "3394f306c4cbf04334b712bf5aaed95f",
        "model_name": "flux_dit",
        "model_class": "diffsynth.models.flux_dit.FluxDiT",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
    },
]

# Registry entries for the FLUX.2 family.
# Every entry pairs a `model_hash` with a `model_name` and the dotted path of the
# `model_class`; `state_dict_converter` and `extra_kwargs` appear only where needed.
# The "Example" comment in each entry shows a ModelConfig associated with that hash.
flux2_series = [
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="text_encoder/*.safetensors")
        "model_hash": "28fca3d8e5bf2a2d1271748a773f6757",
        "model_name": "flux2_text_encoder",
        "model_class": "diffsynth.models.flux2_text_encoder.Flux2TextEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux2_text_encoder.Flux2TextEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="transformer/*.safetensors")
        "model_hash": "d38e1d5c5aec3b0a11e79327ac6e3b0f",
        "model_name": "flux2_dit",
        "model_class": "diffsynth.models.flux2_dit.Flux2DiT",
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
        "model_hash": "c54288e3ee12ca215898840682337b95",
        "model_name": "flux2_vae",
        "model_class": "diffsynth.models.flux2_vae.Flux2VAE",
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="transformer/*.safetensors")
        "model_hash": "3bde7b817fec8143028b6825a63180df",
        "model_name": "flux2_dit",
        "model_class": "diffsynth.models.flux2_dit.Flux2DiT",
        "extra_kwargs": {
            "guidance_embeds": False,
            "joint_attention_dim": 7680,
            "num_attention_heads": 24,
            "num_layers": 5,
            "num_single_layers": 20,
        },
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-9B", origin_file_pattern="text_encoder/*.safetensors")
        # This entry registers the Z-Image text encoder class with model_size "8B".
        "model_hash": "9195f3ea256fcd0ae6d929c203470754",
        "model_name": "z_image_text_encoder",
        "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
        "extra_kwargs": {
            "model_size": "8B",
        },
        "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-9B", origin_file_pattern="transformer/*.safetensors")
        "model_hash": "39c6fc48f07bebecedbbaa971ff466c8",
        "model_name": "flux2_dit",
        "model_class": "diffsynth.models.flux2_dit.Flux2DiT",
        "extra_kwargs": {
            "guidance_embeds": False,
            "joint_attention_dim": 12288,
            "num_attention_heads": 32,
            "num_layers": 8,
            "num_single_layers": 24,
        },
    },
]

# Registry entries for the Z-Image family (DiT, text encoders, VAE, SigLIP encoder,
# ControlNet and image-to-LoRA model).
# Every entry pairs a `model_hash` with a `model_name` and the dotted path of the
# `model_class`; `state_dict_converter` and `extra_kwargs` appear only where needed.
z_image_series = [
    {
        # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="transformer/*.safetensors")
        "model_hash": "fc3a8a1247fe185ce116ccbe0e426c28",
        "model_name": "z_image_dit",
        "model_class": "diffsynth.models.z_image_dit.ZImageDiT",
    },
    {
        # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors")
        "model_hash": "0f050f62a88876fea6eae0a18dac5a2e",
        "model_name": "z_image_text_encoder",
        "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
    },
    # The next two entries share one hash: the same VAE checkpoint is registered
    # once as an encoder and once as a decoder, both using the FLUX VAE classes.
    # NOTE(review): the doubled `vae/vae/` prefix in the example pattern looks like a typo — confirm against the repo layout.
    {
        # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/vae/diffusion_pytorch_model.safetensors")
        "model_hash": "1aafa3cc91716fb6b300cc1cd51b85a3",
        "model_name": "flux_vae_encoder",
        "model_class": "diffsynth.models.flux_vae.FluxVAEEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEEncoderStateDictConverterDiffusers",
        "extra_kwargs": {
            "use_conv_attention": False,
        },
    },
    {
        # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/vae/diffusion_pytorch_model.safetensors")
        "model_hash": "1aafa3cc91716fb6b300cc1cd51b85a3",
        "model_name": "flux_vae_decoder",
        "model_class": "diffsynth.models.flux_vae.FluxVAEDecoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEDecoderStateDictConverterDiffusers",
        "extra_kwargs": {
            "use_conv_attention": False,
        },
    },
    {
        # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Omni-Base", origin_file_pattern="transformer/*.safetensors")
        "model_hash": "aa3563718e5c3ecde3dfbb020ca61180",
        "model_name": "z_image_dit",
        "model_class": "diffsynth.models.z_image_dit.ZImageDiT",
        "extra_kwargs": {
            "siglip_feat_dim": 1152,
        },
    },
    {
        # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Omni-Base", origin_file_pattern="siglip/model.safetensors")
        "model_hash": "89d48e420f45cff95115a9f3e698d44a",
        "model_name": "siglip_vision_model_428m",
        "model_class": "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder428M",
    },
    {
        # Example: ModelConfig(model_id="PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1", origin_file_pattern="Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors")
        "model_hash": "1677708d40029ab380a95f6c731a57d7",
        "model_name": "z_image_controlnet",
        "model_class": "diffsynth.models.z_image_controlnet.ZImageControlNet",
    },
    {
        # Example: ??? (no source checkpoint is recorded for this hash)
        "model_hash": "9510cb8cd1dd34ee0e4f111c24905510",
        "model_name": "z_image_image2lora_style",
        "model_class": "diffsynth.models.z_image_image2lora.ZImageImage2LoRAModel",
        "extra_kwargs": {
            "compress_dim": 128,
        },
    },
    {
        # Example: ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors")
        "model_hash": "1392adecee344136041e70553f875f31",
        "model_name": "z_image_text_encoder",
        "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
        "extra_kwargs": {
            "model_size": "0.6B",
        },
        "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
    },
    {
        # To ensure compatibility with the `model.diffusion_model` prefix introduced by other frameworks.
        "model_hash": "8cf241a0d32f93d5de368502a086852f",
        "model_name": "z_image_dit",
        "model_class": "diffsynth.models.z_image_dit.ZImageDiT",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_dit.ZImageDiTStateDictConverter",
    },
]
"""
Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2
Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage
For the base models of LTX-2, both the official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors"))
and the repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are supported.
We have repackaged the official checkpoints in the DiffSynth-Studio/LTX-2-Repackage repo to support loading the submodules separately
and to avoid redundant memory usage when users only want to use part of the model.
"""
# Registry entries for the LTX-2 / LTX-2.3 family.
# Every entry pairs a `model_hash` with a `model_name` and the dotted path of the
# `model_class`; `state_dict_converter` and `extra_kwargs` appear only where needed.
# One hash may legitimately appear under several different model names (a single
# checkpoint file holding several submodules), but each (model_hash, model_name)
# pair must appear only once: presumably every matching entry is instantiated by
# the loader, so a repeated pair would register the same model twice.
ltx2_series = [
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
        "model_hash": "aca7b0bbf8415e9c98360750268915fc",
        "model_name": "ltx2_dit",
        "model_class": "diffsynth.models.ltx2_dit.LTXModel",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors")
        "model_hash": "c567aaa37d5ed7454c73aa6024458661",
        "model_name": "ltx2_dit",
        "model_class": "diffsynth.models.ltx2_dit.LTXModel",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
        "model_hash": "aca7b0bbf8415e9c98360750268915fc",
        "model_name": "ltx2_video_vae_encoder",
        "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors")
        "model_hash": "7f7e904a53260ec0351b05f32153754b",
        "model_name": "ltx2_video_vae_encoder",
        "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
        "model_hash": "aca7b0bbf8415e9c98360750268915fc",
        "model_name": "ltx2_video_vae_decoder",
        "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors")
        "model_hash": "dc6029ca2825147872b45e35a2dc3a97",
        "model_name": "ltx2_video_vae_decoder",
        "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
        "model_hash": "aca7b0bbf8415e9c98360750268915fc",
        "model_name": "ltx2_audio_vae_decoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors")
        # The audio VAE decoder of DiffSynth-Studio/LTX-2.3-Repackage has the same hash
        # (presumably its audio_vae_decoder.safetensors — TODO confirm file name),
        # so this single entry covers both repos; do not add a second copy.
        "model_hash": "7d7823dde8f1ea0b50fb07ac329dd4cb",
        "model_name": "ltx2_audio_vae_decoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
        "model_hash": "aca7b0bbf8415e9c98360750268915fc",
        "model_name": "ltx2_audio_vocoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2Vocoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors")
        "model_hash": "f471360f6b24bef702ab73133d9f8bb9",
        "model_name": "ltx2_audio_vocoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2Vocoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
        "model_hash": "aca7b0bbf8415e9c98360750268915fc",
        "model_name": "ltx2_audio_vae_encoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_encoder.safetensors")
        # Also matches: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vae_encoder.safetensors")
        # Both files have the same hash, so this single entry covers both repos; do not add a second copy.
        "model_hash": "29338f3b95e7e312a3460a482e4f4554",
        "model_name": "ltx2_audio_vae_encoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
        "model_hash": "aca7b0bbf8415e9c98360750268915fc",
        "model_name": "ltx2_text_encoder_post_modules",
        "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors")
        "model_hash": "981629689c8be92a712ab3c5eb4fc3f6",
        "model_name": "ltx2_text_encoder_post_modules",
        "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors")
        "model_hash": "33917f31c4a79196171154cca39f165e",
        "model_name": "ltx2_text_encoder",
        "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderStateDictConverter",
    },
    {
        # Example (file name presumed by analogy with the LTX-2.3 upscaler entry — TODO confirm):
        # ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-spatial-upscaler-x2-1.0.safetensors")
        # NOTE(review): this hash differs from the one listed for ltx-2-19b-dev.safetensors above,
        # so the upsampler weights must come from a separate file.
        "model_hash": "c79c458c6e99e0e14d47e676761732d2",
        "model_name": "ltx2_latent_upsampler",
        "model_class": "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
        "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
        "model_name": "ltx2_dit",
        "model_class": "diffsynth.models.ltx2_dit.LTXModel",
        "extra_kwargs": {"apply_gated_attention": True, "cross_attention_adaln": True, "caption_channels": None},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
        "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
        "model_name": "ltx2_video_vae_encoder",
        "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
        "extra_kwargs": {"encoder_version": "ltx-2.3"},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
        "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
        "model_name": "ltx2_video_vae_decoder",
        "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
        "extra_kwargs": {"decoder_version": "ltx-2.3"},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
        "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
        "model_name": "ltx2_audio_vae_decoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
        "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
        "model_name": "ltx2_audio_vocoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2VocoderWithBWE",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
        "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
        "model_name": "ltx2_audio_vae_encoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
        "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
        "model_name": "ltx2_text_encoder_post_modules",
        "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
        "extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
        "model_hash": "aed408774d694a2452f69936c32febb5",
        "model_name": "ltx2_latent_upsampler",
        "model_class": "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler",
        "extra_kwargs": {"rational_resampler": False},
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="transformer.safetensors")
        "model_hash": "1c55afad76ed33c112a2978550b524d1",
        "model_name": "ltx2_dit",
        "model_class": "diffsynth.models.ltx2_dit.LTXModel",
        "extra_kwargs": {"apply_gated_attention": True, "cross_attention_adaln": True, "caption_channels": None},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="video_vae_encoder.safetensors")
        "model_hash": "eecdc07c2ec30863b8a2b8b2134036cf",
        "model_name": "ltx2_video_vae_encoder",
        "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
        "extra_kwargs": {"encoder_version": "ltx-2.3"},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="video_vae_decoder.safetensors")
        "model_hash": "deda2f542e17ee25bc8c38fd605316ea",
        "model_name": "ltx2_video_vae_decoder",
        "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
        "extra_kwargs": {"decoder_version": "ltx-2.3"},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
    },
    # The LTX-2.3-Repackage audio VAE decoder and encoder are intentionally not listed
    # here: their hashes (7d7823dd... and 29338f3b...) are already registered above
    # with identical settings.
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vocoder.safetensors")
        "model_hash": "cd436c99e69ec5c80f050f0944f02a15",
        "model_name": "ltx2_audio_vocoder",
        "model_class": "diffsynth.models.ltx2_audio_vae.LTX2VocoderWithBWE",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
    },
    {
        # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors")
        "model_hash": "05da2aab1c4b061f72c426311c165a43",
        "model_name": "ltx2_text_encoder_post_modules",
        "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
        "extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
        "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
    },
]
# Registry entries for the Anima family: a text encoder (registered with the
# Z-Image text encoder class, model_size "0.6B") and the Anima DiT.
anima_series = [
    {
        # Example: ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/vae/qwen_image_vae.safetensors")
        # NOTE(review): the example pattern above points at a VAE file, while this
        # entry registers a text encoder — confirm which file this hash belongs to.
        "model_hash": "a9995952c2d8e63cf82e115005eb61b9",
        "model_name": "z_image_text_encoder",
        "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
        "extra_kwargs": {
            "model_size": "0.6B",
        },
    },
    {
        # Example: ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/diffusion_models/anima-preview.safetensors")
        "model_hash": "417673936471e79e31ed4d186d7a3f4a",
        "model_name": "anima_dit",
        "model_class": "diffsynth.models.anima_dit.AnimaDiT",
        "state_dict_converter": "diffsynth.utils.state_dict_converters.anima_dit.AnimaDiTStateDictConverter",
    },
]

# Registry entries for the MOVA family: audio DiT, audio VAE and the
# dual-tower bridge. None of these entries declares a state-dict converter.
mova_series = [
    {
        # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_dit/diffusion_pytorch_model.safetensors")
        "model_hash": "8c57e12790e2c45a64817e0ce28cde2f",
        "model_name": "mova_audio_dit",
        "model_class": "diffsynth.models.mova_audio_dit.MovaAudioDit",
        # Constructor keyword arguments for the audio DiT.
        "extra_kwargs": {
            "has_image_input": False,
            "patch_size": [1],
            "in_dim": 128,
            "dim": 1536,
            "ffn_dim": 8960,
            "freq_dim": 256,
            "text_dim": 4096,
            "out_dim": 128,
            "num_heads": 12,
            "num_layers": 30,
            "eps": 1e-06,
        },
    },
    {
        # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_vae/diffusion_pytorch_model.safetensors")
        "model_hash": "418517fb2b4e919d2cac8f314fcf82ac",
        "model_name": "mova_audio_vae",
        "model_class": "diffsynth.models.mova_audio_vae.DacVAE",
    },
    {
        # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="dual_tower_bridge/diffusion_pytorch_model.safetensors")
        "model_hash": "d1139dbbc8b4ab53cf4b4243d57bbceb",
        "model_name": "mova_dual_tower_bridge",
        "model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
    },
]
# Flat registry of all model configurations: the per-family lists concatenated
# in a fixed order into one new list.
MODEL_CONFIGS = (
    qwen_image_series
    + wan_series
    + flux_series
    + flux2_series
    + z_image_series
    + ltx2_series
    + anima_series
    + mova_series
)