xiaoanyu123 committed
Commit 71665e5 · verified · 1 Parent(s): 8691793

Add files using upload-large-folder tool

Files changed (21)
  1. .gitattributes +4 -0
  2. cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_adv_train64_8.lib +0 -0
  3. cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_infer.lib +3 -0
  4. cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_infer64_8.lib +3 -0
  5. cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_train.lib +3 -0
  6. cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_train64_8.lib +3 -0
  7. pythonProject/.venv/Lib/site-packages/accelerate/commands/menu/__pycache__/__init__.cpython-310.pyc +0 -0
  8. pythonProject/.venv/Lib/site-packages/accelerate/commands/menu/__pycache__/keymap.cpython-310.pyc +0 -0
  9. pythonProject/.venv/Lib/site-packages/accelerate/commands/menu/__pycache__/selection_menu.cpython-310.pyc +0 -0
  10. pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/INSTALLER +1 -0
  11. pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/METADATA +750 -0
  12. pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/RECORD +35 -0
  13. pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/WHEEL +5 -0
  14. pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/licenses/LICENSE +21 -0
  15. pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/top_level.txt +1 -0
  16. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
  17. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +755 -0
  18. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
  19. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +980 -0
  20. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuandit/__pycache__/__init__.cpython-310.pyc +0 -0
  21. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuandit/__pycache__/pipeline_hunyuandit.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -42,3 +42,7 @@ cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer.lib filter=
42
  cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_ops_infer64_8.lib filter=lfs diff=lfs merge=lfs -text
43
  cudnn-windows-x86_64-8.9.5.30_cuda11-archive/bin/cudnn64_8.dll filter=lfs diff=lfs merge=lfs -text
44
  pythonProject/.venv/Lib/site-packages/accelerate/utils/__pycache__/dataclasses.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
45
+ cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_infer.lib filter=lfs diff=lfs merge=lfs -text
46
+ cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_infer64_8.lib filter=lfs diff=lfs merge=lfs -text
47
+ cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_train.lib filter=lfs diff=lfs merge=lfs -text
48
+ cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_train64_8.lib filter=lfs diff=lfs merge=lfs -text
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_adv_train64_8.lib ADDED
Binary file (30.9 kB). View file
 
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_infer.lib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:957fb42d0fc58c5d043209617ce0ae06ce609d12e8bbab31e3586e93c626f520
3
+ size 2852438
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_infer64_8.lib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:957fb42d0fc58c5d043209617ce0ae06ce609d12e8bbab31e3586e93c626f520
3
+ size 2852438
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_train.lib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69aad4a93f31f68aa34519159b792642150a8a775505a1ed3002110cd03de0fb
3
+ size 1217752
cudnn-windows-x86_64-8.9.5.30_cuda11-archive/lib/x64/cudnn_cnn_train64_8.lib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69aad4a93f31f68aa34519159b792642150a8a775505a1ed3002110cd03de0fb
3
+ size 1217752
pythonProject/.venv/Lib/site-packages/accelerate/commands/menu/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (230 Bytes). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/commands/menu/__pycache__/keymap.cpython-310.pyc ADDED
Binary file (2.39 kB). View file
 
pythonProject/.venv/Lib/site-packages/accelerate/commands/menu/__pycache__/selection_menu.cpython-310.pyc ADDED
Binary file (4.42 kB). View file
 
pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
1
+ pip
pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/METADATA ADDED
@@ -0,0 +1,750 @@
1
+ Metadata-Version: 2.4
2
+ Name: charset-normalizer
3
+ Version: 3.4.3
4
+ Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
5
+ Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
6
+ Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
7
+ License: MIT
8
+ Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
9
+ Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
10
+ Project-URL: Code, https://github.com/jawah/charset_normalizer
11
+ Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
12
+ Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Programming Language :: Python :: 3.14
26
+ Classifier: Programming Language :: Python :: 3 :: Only
27
+ Classifier: Programming Language :: Python :: Implementation :: CPython
28
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
29
+ Classifier: Topic :: Text Processing :: Linguistic
30
+ Classifier: Topic :: Utilities
31
+ Classifier: Typing :: Typed
32
+ Requires-Python: >=3.7
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Provides-Extra: unicode-backport
36
+ Dynamic: license-file
37
+
38
+ <h1 align="center">Charset Detection, for Everyone 👋</h1>
39
+
40
+ <p align="center">
41
+ <sup>The Real First Universal Charset Detector</sup><br>
42
+ <a href="https://pypi.org/project/charset-normalizer">
43
+ <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
44
+ </a>
45
+ <a href="https://pepy.tech/project/charset-normalizer/">
46
+ <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
47
+ </a>
48
+ <a href="https://bestpractices.coreinfrastructure.org/projects/7297">
49
+ <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
50
+ </a>
51
+ </p>
52
+ <p align="center">
53
+ <sup><i>Featured Packages</i></sup><br>
54
+ <a href="https://github.com/jawah/niquests">
55
+ <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Most_Advanced_HTTP_Client-cyan">
56
+ </a>
57
+ <a href="https://github.com/jawah/wassima">
58
+ <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Replacement-cyan">
59
+ </a>
60
+ </p>
61
+ <p align="center">
62
+ <sup><i>In other language (unofficial port - by the community)</i></sup><br>
63
+ <a href="https://github.com/nickspring/charset-normalizer-rs">
64
+ <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
65
+ </a>
66
+ </p>
67
+
68
+ > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
69
+ > I'm trying to resolve the issue by taking a new approach.
70
+ > All IANA character set names for which the Python core library provides codecs are supported.
71
+
72
+ <p align="center">
73
+ >>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
74
+ </p>
75
+
76
+ This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
77
+
78
+ | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
79
+ |--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
80
+ | `Fast` | ❌ | ✅ | ✅ |
81
+ | `Universal**` | ❌ | ✅ | ❌ |
82
+ | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
83
+ | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
84
+ | `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
85
+ | `Native Python` | ✅ | ✅ | ❌ |
86
+ | `Detect spoken language` | ❌ | ✅ | N/A |
87
+ | `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
88
+ | `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
89
+ | `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
90
+
91
+ <p align="center">
92
+ <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
93
+ </p>
94
+
95
+ *\*\* : They clearly use encoding-specific code, even though they cover most of the commonly used encodings.*<br>
96
+
97
+ ## ⚡ Performance
98
+
99
+ This package offers better performance than its counterpart, Chardet. Here are some numbers.
100
+
101
+ | Package | Accuracy | Mean per file (ms) | File per sec (est) |
102
+ |-----------------------------------------------|:--------:|:------------------:|:------------------:|
103
+ | [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
104
+ | charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
105
+
106
+ | Package | 99th percentile | 95th percentile | 50th percentile |
107
+ |-----------------------------------------------|:---------------:|:---------------:|:---------------:|
108
+ | [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
109
+ | charset-normalizer | 100 ms | 50 ms | 5 ms |
110
+
111
+ _updated as of december 2024 using CPython 3.12_
112
+
113
+ Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
114
+
115
+ > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
116
+ > And yes, these results might change at any time. The dataset can be updated to include more files.
117
+ > The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
118
+ > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
119
+ > (e.g. Supported Encoding) Challenge-them if you want.
120
+
121
+ ## ✨ Installation
122
+
123
+ Using pip:
124
+
125
+ ```sh
126
+ pip install charset-normalizer -U
127
+ ```
128
+
129
+ ## 🚀 Basic Usage
130
+
131
+ ### CLI
132
+ This package comes with a CLI.
133
+
134
+ ```
135
+ usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
136
+ file [file ...]
137
+
138
+ The Real First Universal Charset Detector. Discover originating encoding used
139
+ on text file. Normalize text to unicode.
140
+
141
+ positional arguments:
142
+ files File(s) to be analysed
143
+
144
+ optional arguments:
145
+ -h, --help show this help message and exit
146
+ -v, --verbose Display complementary information about file if any.
147
+ Stdout will contain logs about the detection process.
148
+ -a, --with-alternative
149
+ Output complementary possibilities if any. Top-level
150
+ JSON WILL be a list.
151
+ -n, --normalize Permit to normalize input file. If not set, program
152
+ does not write anything.
153
+ -m, --minimal Only output the charset detected to STDOUT. Disabling
154
+ JSON output.
155
+ -r, --replace Replace file when trying to normalize it instead of
156
+ creating a new one.
157
+ -f, --force Replace file without asking if you are sure, use this
158
+ flag with caution.
159
+ -t THRESHOLD, --threshold THRESHOLD
160
+ Define a custom maximum amount of chaos allowed in
161
+ decoded content. 0. <= chaos <= 1.
162
+ --version Show version information and exit.
163
+ ```
164
+
165
+ ```bash
166
+ normalizer ./data/sample.1.fr.srt
167
+ ```
168
+
169
+ or
170
+
171
+ ```bash
172
+ python -m charset_normalizer ./data/sample.1.fr.srt
173
+ ```
174
+
175
+ 🎉 Since version 1.4.0 the CLI produces an easily usable stdout result in JSON format.
176
+
177
+ ```json
178
+ {
179
+ "path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
180
+ "encoding": "cp1252",
181
+ "encoding_aliases": [
182
+ "1252",
183
+ "windows_1252"
184
+ ],
185
+ "alternative_encodings": [
186
+ "cp1254",
187
+ "cp1256",
188
+ "cp1258",
189
+ "iso8859_14",
190
+ "iso8859_15",
191
+ "iso8859_16",
192
+ "iso8859_3",
193
+ "iso8859_9",
194
+ "latin_1",
195
+ "mbcs"
196
+ ],
197
+ "language": "French",
198
+ "alphabets": [
199
+ "Basic Latin",
200
+ "Latin-1 Supplement"
201
+ ],
202
+ "has_sig_or_bom": false,
203
+ "chaos": 0.149,
204
+ "coherence": 97.152,
205
+ "unicode_path": null,
206
+ "is_preferred": true
207
+ }
208
+ ```
209
+
210
+ ### Python
211
+ *Just print out normalized text*
212
+ ```python
213
+ from charset_normalizer import from_path
214
+
215
+ results = from_path('./my_subtitle.srt')
216
+
217
+ print(str(results.best()))
218
+ ```
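+
+ *For byte payloads already in memory, the companion function `from_bytes` follows the same pattern. A minimal sketch (the sample bytes, and the exact guess they yield, are illustrative only):*
+ ```python
+ from charset_normalizer import from_bytes
+
+ payload = "Bonjour, où êtes-vous ?".encode("cp1252")  # illustrative sample bytes
+
+ best_guess = from_bytes(payload).best()
+ if best_guess is not None:
+     print(best_guess.encoding)  # detected codec name (may be a compatible superset of cp1252)
+     print(str(best_guess))      # the decoded, readable text
+ ```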
219
+
220
+ *Upgrade your code without effort*
221
+ ```python
222
+ from charset_normalizer import detect
223
+ ```
224
+
225
+ The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
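+
+ *A minimal drop-in sketch (the returned dictionary mirrors chardet's `encoding` / `language` / `confidence` keys; the actual values depend on your input):*
+ ```python
+ from charset_normalizer import detect
+
+ result = detect("Комментарий на русском".encode("cp1251"))  # illustrative bytes
+ print(result["encoding"], result["confidence"])  # detected codec name and a confidence score
+ ```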
226
+
227
+ See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
228
+
229
+ ## 😇 Why
230
+
231
+ When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
232
+ reliable alternative using a completely different method. Also! I never back down on a good challenge!
233
+
234
+ I **don't care** about the **originating charset** encoding, because **two different tables** can
235
+ produce **two identical rendered string.**
236
+ What I want is to get readable text, the best I can.
237
+
238
+ In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
239
+
240
+ Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair broken Unicode strings, whereas charset-normalizer's is to convert a raw file in an unknown encoding to Unicode.
241
+
242
+ ## 🍰 How
243
+
244
+ - Discard all charset encoding table that could not fit the binary content.
245
+ - Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
246
+ - Extract matches with the lowest mess detected.
247
+ - Additionally, we measure coherence / probe for a language.
248
+
249
+ **Wait a minute**, what is noise/mess and coherence according to **YOU ?**
250
+
251
+ *Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
252
+ **I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
253
+ I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
254
+ improve or rewrite it.
255
+
256
+ *Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
257
+ that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
258
+
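+ *Below is a deliberately naive sketch of that decode-and-score idea. It is not the library's actual implementation (which works chunk by chunk and uses per-language letter-frequency tables); the candidate codecs and the mess metric are illustrative assumptions:*
+ ```python
+ def naive_guess(payload: bytes, candidates=("utf_8", "cp1252", "latin_1", "cp1251")):
+     """Decode with each candidate codec and keep the least 'messy' result."""
+     scored = []
+     for name in candidates:
+         try:
+             text = payload.decode(name)
+         except UnicodeDecodeError:
+             continue  # this table cannot fit the binary content, discard it
+         # crude noise measure: share of characters that are unprintable (ignoring common whitespace)
+         mess = sum(not ch.isprintable() and ch not in "\r\n\t" for ch in text) / max(len(text), 1)
+         scored.append((mess, name, text))
+     return min(scored) if scored else None  # the lowest-mess candidate wins
+
+
+ # naive_guess("héllo wörld".encode("cp1252")) -> (mess_score, codec_name, decoded_text)
+ ```
+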
259
+ ## ⚡ Known limitations
260
+
261
+ - Language detection is unreliable when the text contains two or more languages sharing identical letters (e.g. HTML with English tags plus Turkish content, both sharing Latin characters).
262
+ - Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
263
+
264
+ ## ⚠️ About Python EOLs
265
+
266
+ **If you are running:**
267
+
268
+ - Python >=2.7,<3.5: Unsupported
269
+ - Python 3.5: charset-normalizer < 2.1
270
+ - Python 3.6: charset-normalizer < 3.1
271
+ - Python 3.7: charset-normalizer < 4.0
272
+
273
+ Upgrade your Python interpreter as soon as possible.
274
+
275
+ ## 👤 Contributing
276
+
277
+ Contributions, issues and feature requests are very much welcome.<br />
278
+ Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
279
+
280
+ ## 📝 License
281
+
282
+ Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
283
+ This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
284
+
285
+ Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
286
+
287
+ ## 💼 For Enterprise
288
+
289
+ Professional support for charset-normalizer is available as part of the [Tidelift
290
+ Subscription][1]. Tidelift gives software development teams a single source for
291
+ purchasing and maintaining their software, with professional grade assurances
292
+ from the experts who know it best, while seamlessly integrating with existing
293
+ tools.
294
+
295
+ [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
296
+
297
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
298
+
299
+ # Changelog
300
+ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
301
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
302
+
303
+ ## [3.4.3](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.3) (2025-08-09)
304
+
305
+ ### Changed
306
+ - mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
307
+ - automatically lower confidence on small bytes samples that are not Unicode in `detect` output legacy function. (#391)
308
+
309
+ ### Added
310
+ - Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
311
+ - Support for Python 3.14
312
+
313
+ ### Fixed
314
+ - sdist archive contained useless directories.
315
+ - automatically fallback on valid UTF-16 or UTF-32 even if the md says it's noisy. (#633)
316
+
317
+ ### Misc
318
+ - SBOM are automatically published to the relevant GitHub release to comply with regulatory changes.
319
+ Each published wheel comes with its SBOM. We choose CycloneDX as the format.
320
+ - Prebuilt optimized wheel are no longer distributed by default for CPython 3.7 due to a change in cibuildwheel.
321
+
322
+ ## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
323
+
324
+ ### Fixed
325
+ - Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
326
+ - Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
327
+
328
+ ### Changed
329
+ - Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
330
+
331
+ ## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
332
+
333
+ ### Changed
334
+ - Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
335
+ - Enforce annotation delayed loading for a simpler and consistent types in the project.
336
+ - Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
337
+
338
+ ### Added
339
+ - pre-commit configuration.
340
+ - noxfile.
341
+
342
+ ### Removed
343
+ - `build-requirements.txt` as per using `pyproject.toml` native build configuration.
344
+ - `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
345
+ - `setup.cfg` in favor of `pyproject.toml` metadata configuration.
346
+ - Unused `utils.range_scan` function.
347
+
348
+ ### Fixed
349
+ - Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
350
+ - Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
351
+
352
+ ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
353
+
354
+ ### Added
355
+ - Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
356
+ - Support for Python 3.13 (#512)
357
+
358
+ ### Fixed
359
+ - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
360
+ - Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
361
+ - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
362
+
363
+ ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
364
+
365
+ ### Fixed
366
+ - Unintentional memory usage regression when using large payload that match several encoding (#376)
367
+ - Regression on some detection case showcased in the documentation (#371)
368
+
369
+ ### Added
370
+ - Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
371
+
372
+ ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
373
+
374
+ ### Changed
375
+ - Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
376
+ - Improved the general detection reliability based on reports from the community
377
+
378
+ ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
379
+
380
+ ### Added
381
+ - Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
382
+ - Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
383
+
384
+ ### Removed
385
+ - (internal) Redundant utils.is_ascii function and unused function is_private_use_only
386
+ - (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
387
+
388
+ ### Changed
389
+ - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
390
+ - Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
391
+
392
+ ### Fixed
393
+ - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
394
+
395
+ ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
396
+
397
+ ### Changed
398
+ - Typehint for function `from_path` no longer enforce `PathLike` as its first argument
399
+ - Minor improvement over the global detection reliability
400
+
401
+ ### Added
402
+ - Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
403
+ - Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
404
+ - Explicit support for Python 3.12
405
+
406
+ ### Fixed
407
+ - Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
408
+
409
+ ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
410
+
411
+ ### Added
412
+ - Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
413
+
414
+ ### Removed
415
+ - Support for Python 3.6 (PR #260)
416
+
417
+ ### Changed
418
+ - Optional speedup provided by mypy/c 1.0.1
419
+
420
+ ## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
421
+
422
+ ### Fixed
423
+ - Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
424
+
425
+ ### Changed
426
+ - Speedup provided by mypy/c 0.990 on Python >= 3.7
427
+
428
+ ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
429
+
430
+ ### Added
431
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
432
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
433
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
434
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
435
+
436
+ ### Changed
437
+ - Build with static metadata using 'build' frontend
438
+ - Make the language detection stricter
439
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
440
+
441
+ ### Fixed
442
+ - CLI with opt --normalize fail when using full path for files
443
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
444
+ - Sphinx warnings when generating the documentation
445
+
446
+ ### Removed
447
+ - Coherence detector no longer return 'Simple English' instead return 'English'
448
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
449
+ - Breaking: Method `first()` and `best()` from CharsetMatch
450
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
451
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
452
+ - Breaking: Top-level function `normalize`
453
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
454
+ - Support for the backport `unicodedata2`
455
+
456
+ ## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
457
+
458
+ ### Added
459
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
460
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
461
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
462
+
463
+ ### Changed
464
+ - Build with static metadata using 'build' frontend
465
+ - Make the language detection stricter
466
+
467
+ ### Fixed
468
+ - CLI with opt --normalize fail when using full path for files
469
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
470
+
471
+ ### Removed
472
+ - Coherence detector no longer return 'Simple English' instead return 'English'
473
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
474
+
475
+ ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
476
+
477
+ ### Added
478
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
479
+
480
+ ### Removed
481
+ - Breaking: Method `first()` and `best()` from CharsetMatch
482
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
483
+
484
+ ### Fixed
485
+ - Sphinx warnings when generating the documentation
486
+
487
+ ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
488
+
489
+ ### Changed
490
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
491
+
492
+ ### Removed
493
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
494
+ - Breaking: Top-level function `normalize`
495
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
496
+ - Support for the backport `unicodedata2`
497
+
498
+ ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
499
+
500
+ ### Deprecated
501
+ - Function `normalize` scheduled for removal in 3.0
502
+
503
+ ### Changed
504
+ - Removed useless call to decode in fn is_unprintable (#206)
505
+
506
+ ### Fixed
507
+ - Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
508
+
509
+ ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
510
+
511
+ ### Added
512
+ - Output the Unicode table version when running the CLI with `--version` (PR #194)
513
+
514
+ ### Changed
515
+ - Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
516
+ - Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
517
+
518
+ ### Fixed
519
+ - Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
520
+ - CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
521
+
522
+ ### Removed
523
+ - Support for Python 3.5 (PR #192)
524
+
525
+ ### Deprecated
526
+ - Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
527
+
528
+ ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
529
+
530
+ ### Fixed
531
+ - ASCII miss-detection on rare cases (PR #170)
532
+
533
+ ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
534
+
535
+ ### Added
536
+ - Explicit support for Python 3.11 (PR #164)
537
+
538
+ ### Changed
539
+ - The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
540
+
541
+ ## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
542
+
543
+ ### Fixed
544
+ - Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
545
+
546
+ ### Changed
547
+ - Skipping the language-detection (CD) on ASCII (PR #155)
548
+
549
+ ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
550
+
551
+ ### Changed
552
+ - Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
553
+
554
+ ### Fixed
555
+ - Wrong logging level applied when setting kwarg `explain` to True (PR #146)
556
+
557
+ ## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
558
+ ### Changed
559
+ - Improvement over Vietnamese detection (PR #126)
560
+ - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
561
+ - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
562
+ - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
563
+ - Code style as refactored by Sourcery-AI (PR #131)
564
+ - Minor adjustment on the MD around european words (PR #133)
565
+ - Remove and replace SRTs from assets / tests (PR #139)
566
+ - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
567
+ - Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
568
+
569
+ ### Fixed
570
+ - Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
571
+ - Avoid using too insignificant chunk (PR #137)
572
+
573
+ ### Added
574
+ - Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
575
+ - Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
576
+
577
+ ## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
578
+ ### Added
579
+ - Add support for Kazakh (Cyrillic) language detection (PR #109)
580
+
581
+ ### Changed
582
+ - Further, improve inferring the language from a given single-byte code page (PR #112)
583
+ - Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
584
+ - Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
585
+ - Various detection improvement (MD+CD) (PR #117)
586
+
587
+ ### Removed
588
+ - Remove redundant logging entry about detected language(s) (PR #115)
589
+
590
+ ### Fixed
591
+ - Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
592
+
593
+ ## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
594
+ ### Fixed
595
+ - Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
596
+ - Fix CLI crash when using --minimal output in certain cases (PR #103)
597
+
598
+ ### Changed
599
+ - Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
600
+
601
+ ## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
602
+ ### Changed
603
+ - The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
604
+ - The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
605
+ - The Unicode detection is slightly improved (PR #93)
606
+ - Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
607
+
608
+ ### Removed
609
+ - The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
610
+
611
+ ### Fixed
612
+ - In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
613
+ - Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
614
+ - The MANIFEST.in was not exhaustive (PR #78)
615
+
616
+ ## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
617
+ ### Fixed
618
+ - The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
619
+ - Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
620
+ - The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
621
+ - Submatch factoring could be wrong in rare edge cases (PR #72)
622
+ - Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
623
+ - Fix line endings from CRLF to LF for certain project files (PR #67)
624
+
625
+ ### Changed
626
+ - Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
627
+ - Allow fallback on specified encoding if any (PR #71)
628
+
629
+ ## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
630
+ ### Changed
631
+ - Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
632
+ - According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
633
+
634
+ ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
635
+ ### Fixed
636
+ - Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
637
+
638
+ ### Changed
639
+ - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
640
+
641
+ ## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
642
+ ### Fixed
643
+ - Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
644
+ - Using explain=False permanently disable the verbose output in the current runtime (PR #47)
645
+ - One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
646
+ - Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
647
+
648
+ ### Changed
649
+ - Public function normalize default args values were not aligned with from_bytes (PR #53)
650
+
651
+ ### Added
652
+ - You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
653
+
654
+ ## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
655
+ ### Changed
656
+ - 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
657
+ - Accent has been made on UTF-8 detection, should perform rather instantaneous.
658
+ - The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
659
+ - The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
660
+ - The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
661
+ - utf_7 detection has been reinstated.
662
+
663
+ ### Removed
664
+ - This package no longer require anything when used with Python 3.5 (Dropped cached_property)
665
+ - Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
666
+ - The exception hook on UnicodeDecodeError has been removed.
667
+
668
+ ### Deprecated
669
+ - Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
670
+
671
+ ### Fixed
672
+ - The CLI output used the relative path of the file(s). Should be absolute.
673
+
674
+ ## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
675
+ ### Fixed
676
+ - Logger configuration/usage no longer conflict with others (PR #44)
677
+
678
+ ## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
679
+ ### Removed
680
+ - Using standard logging instead of using the package loguru.
681
+ - Dropping nose test framework in favor of the maintained pytest.
682
+ - Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
683
+ - Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
684
+ - Stop support for UTF-7 that does not contain a SIG.
685
+ - Dropping PrettyTable, replaced with pure JSON output in CLI.
686
+
687
+ ### Fixed
688
+ - BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
689
+ - Not searching properly for the BOM when trying utf32/16 parent codec.
690
+
691
+ ### Changed
692
+ - Improving the package final size by compressing frequencies.json.
693
+ - Huge improvement over the larges payload.
694
+
695
+ ### Added
696
+ - CLI now produces JSON consumable output.
697
+ - Return ASCII if given sequences fit. Given reasonable confidence.
698
+
699
+ ## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
700
+
701
+ ### Fixed
702
+ - In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
703
+
704
+ ## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
705
+
706
+ ### Fixed
707
+ - Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
708
+
709
+ ## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
710
+
711
+ ### Fixed
712
+ - The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
713
+
714
+ ## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
715
+
716
+ ### Changed
717
+ - Amend the previous release to allow prettytable 2.0 (PR #35)
718
+
719
+ ## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
720
+
721
+ ### Fixed
722
+ - Fix error while using the package with a python pre-release interpreter (PR #33)
723
+
724
+ ### Changed
725
+ - Dependencies refactoring, constraints revised.
726
+
727
+ ### Added
728
+ - Add python 3.9 and 3.10 to the supported interpreters
729
+
730
+ MIT License
731
+
732
+ Copyright (c) 2025 TAHRI Ahmed R.
733
+
734
+ Permission is hereby granted, free of charge, to any person obtaining a copy
735
+ of this software and associated documentation files (the "Software"), to deal
736
+ in the Software without restriction, including without limitation the rights
737
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
738
+ copies of the Software, and to permit persons to whom the Software is
739
+ furnished to do so, subject to the following conditions:
740
+
741
+ The above copyright notice and this permission notice shall be included in all
742
+ copies or substantial portions of the Software.
743
+
744
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
745
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
746
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
747
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
748
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
749
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
750
+ SOFTWARE.
pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/RECORD ADDED
@@ -0,0 +1,35 @@
1
+ ../../Scripts/normalizer.exe,sha256=BsPzI8MuFiVYFkh7wCZGehs-mT7T9f8cK_aAoTNnH6M,108423
2
+ charset_normalizer-3.4.3.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
3
+ charset_normalizer-3.4.3.dist-info/METADATA,sha256=tqX3UoI-UkqIN99aZsk646yI4NgMbu1MjlKr6BbITG4,37450
4
+ charset_normalizer-3.4.3.dist-info/RECORD,,
5
+ charset_normalizer-3.4.3.dist-info/WHEEL,sha256=KUuBC6lxAbHCKilKua8R9W_TM71_-9Sg5uEP3uDWcoU,101
6
+ charset_normalizer-3.4.3.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
7
+ charset_normalizer-3.4.3.dist-info/licenses/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
8
+ charset_normalizer-3.4.3.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
9
+ charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
10
+ charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
11
+ charset_normalizer/__pycache__/__init__.cpython-310.pyc,,
12
+ charset_normalizer/__pycache__/__main__.cpython-310.pyc,,
13
+ charset_normalizer/__pycache__/api.cpython-310.pyc,,
14
+ charset_normalizer/__pycache__/cd.cpython-310.pyc,,
15
+ charset_normalizer/__pycache__/constant.cpython-310.pyc,,
16
+ charset_normalizer/__pycache__/legacy.cpython-310.pyc,,
17
+ charset_normalizer/__pycache__/md.cpython-310.pyc,,
18
+ charset_normalizer/__pycache__/models.cpython-310.pyc,,
19
+ charset_normalizer/__pycache__/utils.cpython-310.pyc,,
20
+ charset_normalizer/__pycache__/version.cpython-310.pyc,,
21
+ charset_normalizer/api.py,sha256=ODy4hX78b3ldTl5sViYPU1yzQ5qkclfgSIFE8BtNrTI,23337
22
+ charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
23
+ charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
24
+ charset_normalizer/cli/__main__.py,sha256=-pdJCyPywouPyFsC8_eTSgTmvh1YEvgjsvy1WZ0XjaA,13027
25
+ charset_normalizer/cli/__pycache__/__init__.cpython-310.pyc,,
26
+ charset_normalizer/cli/__pycache__/__main__.cpython-310.pyc,,
27
+ charset_normalizer/constant.py,sha256=mCJmYzpBU27Ut9kiNWWoBbhhxQ-aRVw3K7LSwoFwBGI,44728
28
+ charset_normalizer/legacy.py,sha256=ui08NlKqAXU3Y7smK-NFJjEgRRQz9ruM7aNCbT0OOrE,2811
29
+ charset_normalizer/md.cp310-win_amd64.pyd,sha256=BQ208ayzKOrtZHPb785b5Hgvw5tc2WszcfHritUOPnw,10752
30
+ charset_normalizer/md.py,sha256=LSuW2hNgXSgF7JGdRapLAHLuj6pABHiP85LTNAYmu7c,20780
31
+ charset_normalizer/md__mypyc.cp310-win_amd64.pyd,sha256=PZHNdte6DpklIoi1GRxQ21vg2eLyv1_q1dx7v_9yui0,125952
32
+ charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
33
+ charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ charset_normalizer/utils.py,sha256=XtWIQeOuz7cnGebMzyi4Vvi1JtA84QBSIeR9PDzF7pw,12584
35
+ charset_normalizer/version.py,sha256=laniWEeVCCfwRgYLf_rZ2f0qWaNwWTEXQEfUUL_MMvw,123
pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp310-cp310-win_amd64
5
+
pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 TAHRI Ahmed R.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pythonProject/.venv/Lib/site-packages/charset_normalizer-3.4.3.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ charset_normalizer
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py ADDED
@@ -0,0 +1,804 @@
1
+ # Copyright 2025 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ from transformers import CLIPTextModel, CLIPTokenizer, LlamaModel, LlamaTokenizerFast
21
+
22
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
23
+ from ...image_processor import PipelineImageInput
24
+ from ...loaders import HunyuanVideoLoraLoaderMixin
25
+ from ...models import AutoencoderKLHunyuanVideo, HunyuanVideoTransformer3DModel
26
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
27
+ from ...utils import is_torch_xla_available, logging, replace_example_docstring
28
+ from ...utils.torch_utils import randn_tensor
29
+ from ...video_processor import VideoProcessor
30
+ from ..pipeline_utils import DiffusionPipeline
31
+ from .pipeline_output import HunyuanVideoPipelineOutput
32
+
33
+
34
+ if is_torch_xla_available():
35
+ import torch_xla.core.xla_model as xm
36
+
37
+ XLA_AVAILABLE = True
38
+ else:
39
+ XLA_AVAILABLE = False
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+
44
+ EXAMPLE_DOC_STRING = """
45
+ Examples:
46
+ ```python
47
+ >>> import torch
48
+ >>> from diffusers import HunyuanSkyreelsImageToVideoPipeline, HunyuanVideoTransformer3DModel
49
+ >>> from diffusers.utils import load_image, export_to_video
50
+
51
+ >>> model_id = "hunyuanvideo-community/HunyuanVideo"
52
+ >>> transformer_model_id = "Skywork/SkyReels-V1-Hunyuan-I2V"
53
+ >>> transformer = HunyuanVideoTransformer3DModel.from_pretrained(
54
+ ... transformer_model_id, torch_dtype=torch.bfloat16
55
+ ... )
56
+ >>> pipe = HunyuanSkyreelsImageToVideoPipeline.from_pretrained(
57
+ ... model_id, transformer=transformer, torch_dtype=torch.float16
58
+ ... )
59
+ >>> pipe.vae.enable_tiling()
60
+ >>> pipe.to("cuda")
61
+
62
+ >>> prompt = "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
63
+ >>> negative_prompt = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
64
+ >>> image = load_image(
65
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
66
+ ... )
67
+
68
+ >>> output = pipe(
69
+ ... image=image,
70
+ ... prompt=prompt,
71
+ ... negative_prompt=negative_prompt,
72
+ ... num_inference_steps=30,
73
+ ... true_cfg_scale=6.0,
74
+ ... guidance_scale=1.0,
75
+ ... ).frames[0]
76
+ >>> export_to_video(output, "output.mp4", fps=15)
77
+ ```
78
+ """
79
+
80
+
81
+ DEFAULT_PROMPT_TEMPLATE = {
82
+ "template": (
83
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
84
+ "1. The main content and theme of the video."
85
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
86
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
87
+ "4. background environment, light, style and atmosphere."
88
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
89
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
90
+ ),
91
+ "crop_start": 95,
92
+ }
93
+
94
+
95
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
96
+ def retrieve_timesteps(
97
+ scheduler,
98
+ num_inference_steps: Optional[int] = None,
99
+ device: Optional[Union[str, torch.device]] = None,
100
+ timesteps: Optional[List[int]] = None,
101
+ sigmas: Optional[List[float]] = None,
102
+ **kwargs,
103
+ ):
104
+ r"""
105
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
106
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
107
+
108
+ Args:
109
+ scheduler (`SchedulerMixin`):
110
+ The scheduler to get timesteps from.
111
+ num_inference_steps (`int`):
112
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
113
+ must be `None`.
114
+ device (`str` or `torch.device`, *optional*):
115
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
116
+ timesteps (`List[int]`, *optional*):
117
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
118
+ `num_inference_steps` and `sigmas` must be `None`.
119
+ sigmas (`List[float]`, *optional*):
120
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
121
+ `num_inference_steps` and `timesteps` must be `None`.
122
+
123
+ Returns:
124
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
125
+ second element is the number of inference steps.
126
+ """
127
+ if timesteps is not None and sigmas is not None:
128
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
129
+ if timesteps is not None:
130
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
131
+ if not accepts_timesteps:
132
+ raise ValueError(
133
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
134
+ f" timestep schedules. Please check whether you are using the correct scheduler."
135
+ )
136
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
137
+ timesteps = scheduler.timesteps
138
+ num_inference_steps = len(timesteps)
139
+ elif sigmas is not None:
140
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
141
+ if not accept_sigmas:
142
+ raise ValueError(
143
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
144
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
145
+ )
146
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
147
+ timesteps = scheduler.timesteps
148
+ num_inference_steps = len(timesteps)
149
+ else:
150
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
151
+ timesteps = scheduler.timesteps
152
+ return timesteps, num_inference_steps
153
+
154
+
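As a quick illustration of how the helper above resolves its mutually exclusive inputs, here is a hedged sketch; it assumes a recent diffusers release whose `FlowMatchEulerDiscreteScheduler.set_timesteps` accepts a `sigmas` list (the scheduler class imported at the top of this file).

```python
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler()

# Default path: the scheduler builds its own 30-step schedule.
timesteps, num_steps = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")

# Custom sigmas: the number of steps is inferred from the list instead.
timesteps, num_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=[1.0, 0.75, 0.5, 0.25])

# Passing both `timesteps` and `sigmas` raises a ValueError, as guarded at the top of the helper.
```
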
155
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
156
+ def retrieve_latents(
157
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
158
+ ):
159
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
160
+ return encoder_output.latent_dist.sample(generator)
161
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
162
+ return encoder_output.latent_dist.mode()
163
+ elif hasattr(encoder_output, "latents"):
164
+ return encoder_output.latents
165
+ else:
166
+ raise AttributeError("Could not access latents of provided encoder_output")
167
+
168
+
169
+ class HunyuanSkyreelsImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
170
+ r"""
171
+ Pipeline for image-to-video generation using HunyuanVideo.
172
+
173
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
174
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
175
+
176
+ Args:
177
+ text_encoder ([`LlamaModel`]):
178
+ [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
179
+ tokenizer (`LlamaTokenizer`):
180
+ Tokenizer from [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
181
+ transformer ([`HunyuanVideoTransformer3DModel`]):
182
+ Conditional Transformer to denoise the encoded image latents.
183
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
184
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
185
+ vae ([`AutoencoderKLHunyuanVideo`]):
186
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
187
+ text_encoder_2 ([`CLIPTextModel`]):
188
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
189
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
190
+ tokenizer_2 (`CLIPTokenizer`):
191
+ Tokenizer of class
192
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
193
+ """
194
+
195
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
196
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
197
+
198
+ def __init__(
199
+ self,
200
+ text_encoder: LlamaModel,
201
+ tokenizer: LlamaTokenizerFast,
202
+ transformer: HunyuanVideoTransformer3DModel,
203
+ vae: AutoencoderKLHunyuanVideo,
204
+ scheduler: FlowMatchEulerDiscreteScheduler,
205
+ text_encoder_2: CLIPTextModel,
206
+ tokenizer_2: CLIPTokenizer,
207
+ ):
208
+ super().__init__()
209
+
210
+ self.register_modules(
211
+ vae=vae,
212
+ text_encoder=text_encoder,
213
+ tokenizer=tokenizer,
214
+ transformer=transformer,
215
+ scheduler=scheduler,
216
+ text_encoder_2=text_encoder_2,
217
+ tokenizer_2=tokenizer_2,
218
+ )
219
+
220
+ self.vae_scale_factor_temporal = self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4
221
+ self.vae_scale_factor_spatial = self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 8
222
+ self.vae_scaling_factor = self.vae.config.scaling_factor if getattr(self, "vae", None) else 0.476986
223
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
224
+
225
+ # Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline._get_llama_prompt_embeds
226
+ def _get_llama_prompt_embeds(
227
+ self,
228
+ prompt: Union[str, List[str]],
229
+ prompt_template: Dict[str, Any],
230
+ num_videos_per_prompt: int = 1,
231
+ device: Optional[torch.device] = None,
232
+ dtype: Optional[torch.dtype] = None,
233
+ max_sequence_length: int = 256,
234
+ num_hidden_layers_to_skip: int = 2,
235
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
236
+ device = device or self._execution_device
237
+ dtype = dtype or self.text_encoder.dtype
238
+
239
+ prompt = [prompt] if isinstance(prompt, str) else prompt
240
+ batch_size = len(prompt)
241
+
242
+ prompt = [prompt_template["template"].format(p) for p in prompt]
243
+
244
+ crop_start = prompt_template.get("crop_start", None)
245
+ if crop_start is None:
246
+ prompt_template_input = self.tokenizer(
247
+ prompt_template["template"],
248
+ padding="max_length",
249
+ return_tensors="pt",
250
+ return_length=False,
251
+ return_overflowing_tokens=False,
252
+ return_attention_mask=False,
253
+ )
254
+ crop_start = prompt_template_input["input_ids"].shape[-1]
255
+ # Remove <|eot_id|> token and placeholder {}
256
+ crop_start -= 2
257
+
258
+ max_sequence_length += crop_start
259
+ text_inputs = self.tokenizer(
260
+ prompt,
261
+ max_length=max_sequence_length,
262
+ padding="max_length",
263
+ truncation=True,
264
+ return_tensors="pt",
265
+ return_length=False,
266
+ return_overflowing_tokens=False,
267
+ return_attention_mask=True,
268
+ )
269
+ text_input_ids = text_inputs.input_ids.to(device=device)
270
+ prompt_attention_mask = text_inputs.attention_mask.to(device=device)
271
+
272
+ prompt_embeds = self.text_encoder(
273
+ input_ids=text_input_ids,
274
+ attention_mask=prompt_attention_mask,
275
+ output_hidden_states=True,
276
+ ).hidden_states[-(num_hidden_layers_to_skip + 1)]
277
+ prompt_embeds = prompt_embeds.to(dtype=dtype)
278
+
279
+ if crop_start is not None and crop_start > 0:
280
+ prompt_embeds = prompt_embeds[:, crop_start:]
281
+ prompt_attention_mask = prompt_attention_mask[:, crop_start:]
282
+
283
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
284
+ _, seq_len, _ = prompt_embeds.shape
285
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
286
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
287
+ prompt_attention_mask = prompt_attention_mask.repeat(1, num_videos_per_prompt)
288
+ prompt_attention_mask = prompt_attention_mask.view(batch_size * num_videos_per_prompt, seq_len)
289
+
290
+ return prompt_embeds, prompt_attention_mask
291
+
292
+ # Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline._get_clip_prompt_embeds
293
+ def _get_clip_prompt_embeds(
294
+ self,
295
+ prompt: Union[str, List[str]],
296
+ num_videos_per_prompt: int = 1,
297
+ device: Optional[torch.device] = None,
298
+ dtype: Optional[torch.dtype] = None,
299
+ max_sequence_length: int = 77,
300
+ ) -> torch.Tensor:
301
+ device = device or self._execution_device
302
+ dtype = dtype or self.text_encoder_2.dtype
303
+
304
+ prompt = [prompt] if isinstance(prompt, str) else prompt
305
+ batch_size = len(prompt)
306
+
307
+ text_inputs = self.tokenizer_2(
308
+ prompt,
309
+ padding="max_length",
310
+ max_length=max_sequence_length,
311
+ truncation=True,
312
+ return_tensors="pt",
313
+ )
314
+
315
+ text_input_ids = text_inputs.input_ids
316
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
317
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
318
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
319
+ logger.warning(
320
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
321
+ f" {max_sequence_length} tokens: {removed_text}"
322
+ )
323
+
324
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False).pooler_output
325
+
326
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
327
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
328
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, -1)
329
+
330
+ return prompt_embeds
331
+
332
+ # Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline.encode_prompt
333
+ def encode_prompt(
334
+ self,
335
+ prompt: Union[str, List[str]],
336
+ prompt_2: Union[str, List[str]] = None,
337
+ prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
338
+ num_videos_per_prompt: int = 1,
339
+ prompt_embeds: Optional[torch.Tensor] = None,
340
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
341
+ prompt_attention_mask: Optional[torch.Tensor] = None,
342
+ device: Optional[torch.device] = None,
343
+ dtype: Optional[torch.dtype] = None,
344
+ max_sequence_length: int = 256,
345
+ ):
346
+ if prompt_embeds is None:
347
+ prompt_embeds, prompt_attention_mask = self._get_llama_prompt_embeds(
348
+ prompt,
349
+ prompt_template,
350
+ num_videos_per_prompt,
351
+ device=device,
352
+ dtype=dtype,
353
+ max_sequence_length=max_sequence_length,
354
+ )
355
+
356
+ if pooled_prompt_embeds is None:
357
+ if prompt_2 is None:
358
+ prompt_2 = prompt
359
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
360
+ prompt_2,
361
+ num_videos_per_prompt,
362
+ device=device,
363
+ dtype=dtype,
364
+ max_sequence_length=77,
365
+ )
366
+
367
+ return prompt_embeds, pooled_prompt_embeds, prompt_attention_mask
368
+
369
+ # Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline.check_inputs
370
+ def check_inputs(
371
+ self,
372
+ prompt,
373
+ prompt_2,
374
+ height,
375
+ width,
376
+ prompt_embeds=None,
377
+ callback_on_step_end_tensor_inputs=None,
378
+ prompt_template=None,
379
+ ):
380
+ if height % 16 != 0 or width % 16 != 0:
381
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
382
+
383
+ if callback_on_step_end_tensor_inputs is not None and not all(
384
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
385
+ ):
386
+ raise ValueError(
387
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
388
+ )
389
+
390
+ if prompt is not None and prompt_embeds is not None:
391
+ raise ValueError(
392
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
393
+ " only forward one of the two."
394
+ )
395
+ elif prompt_2 is not None and prompt_embeds is not None:
396
+ raise ValueError(
397
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
398
+ " only forward one of the two."
399
+ )
400
+ elif prompt is None and prompt_embeds is None:
401
+ raise ValueError(
402
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
403
+ )
404
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
405
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
406
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
407
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
408
+
409
+ if prompt_template is not None:
410
+ if not isinstance(prompt_template, dict):
411
+ raise ValueError(f"`prompt_template` has to be of type `dict` but is {type(prompt_template)}")
412
+ if "template" not in prompt_template:
413
+ raise ValueError(
414
+ f"`prompt_template` has to contain a key `template` but only found {prompt_template.keys()}"
415
+ )
416
+
417
+ def prepare_latents(
418
+ self,
419
+ image: torch.Tensor,
420
+ batch_size: int,
421
+ num_channels_latents: int = 32,
422
+ height: int = 544,
423
+ width: int = 960,
424
+ num_frames: int = 97,
425
+ dtype: Optional[torch.dtype] = None,
426
+ device: Optional[torch.device] = None,
427
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
428
+ latents: Optional[torch.Tensor] = None,
429
+ ) -> torch.Tensor:
430
+ if isinstance(generator, list) and len(generator) != batch_size:
431
+ raise ValueError(
432
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
433
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
434
+ )
435
+
436
+ image = image.unsqueeze(2) # [B, C, 1, H, W]
437
+ if isinstance(generator, list):
438
+ image_latents = [
439
+ retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
440
+ ]
441
+ else:
442
+ image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
443
+
444
+ image_latents = torch.cat(image_latents, dim=0).to(dtype) * self.vae_scaling_factor
445
+
446
+ num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
447
+ latent_height, latent_width = height // self.vae_scale_factor_spatial, width // self.vae_scale_factor_spatial
448
+ shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
449
+ padding_shape = (batch_size, num_channels_latents, num_latent_frames - 1, latent_height, latent_width)
450
+
451
+ latents_padding = torch.zeros(padding_shape, dtype=dtype, device=device)
452
+ image_latents = torch.cat([image_latents, latents_padding], dim=2)
453
+
454
+ if latents is None:
455
+ latents = randn_tensor(shape, generator=generator, dtype=dtype, device=device)
456
+ else:
457
+ latents = latents.to(dtype=dtype, device=device)
458
+
459
+ return latents, image_latents
460
+
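A short worked example of the shape arithmetic in `prepare_latents` above, assuming the stock compression ratios (4 temporal, 8 spatial) and a SkyReels transformer with `in_channels` of 32 (so `in_channels // 2 == 16`):

```python
num_frames, height, width = 97, 544, 960          # the pipeline defaults
temporal, spatial = 4, 8                          # vae_scale_factor_temporal / _spatial

num_latent_frames = (num_frames - 1) // temporal + 1   # 25
latent_height = height // spatial                       # 68
latent_width = width // spatial                         # 120

# __call__ passes transformer.config.in_channels // 2 (assumed 16 here) as num_channels_latents,
# so the noise latents are (batch, 16, 25, 68, 120); the encoded first frame is zero-padded to
# the same 25 latent frames and concatenated with the noise along the channel axis.
```
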
461
+ def enable_vae_slicing(self):
462
+ r"""
463
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
464
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
465
+ """
466
+ self.vae.enable_slicing()
467
+
468
+ def disable_vae_slicing(self):
469
+ r"""
470
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
471
+ computing decoding in one step.
472
+ """
473
+ self.vae.disable_slicing()
474
+
475
+ def enable_vae_tiling(self):
476
+ r"""
477
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
478
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
479
+ processing larger images.
480
+ """
481
+ self.vae.enable_tiling()
482
+
483
+ def disable_vae_tiling(self):
484
+ r"""
485
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
486
+ computing decoding in one step.
487
+ """
488
+ self.vae.disable_tiling()
489
+
490
+ @property
491
+ def guidance_scale(self):
492
+ return self._guidance_scale
493
+
494
+ @property
495
+ def num_timesteps(self):
496
+ return self._num_timesteps
497
+
498
+ @property
499
+ def attention_kwargs(self):
500
+ return self._attention_kwargs
501
+
502
+ @property
503
+ def current_timestep(self):
504
+ return self._current_timestep
505
+
506
+ @property
507
+ def interrupt(self):
508
+ return self._interrupt
509
+
510
+ @torch.no_grad()
511
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
512
+ def __call__(
513
+ self,
514
+ image: PipelineImageInput,
515
+ prompt: Union[str, List[str]] = None,
516
+ prompt_2: Union[str, List[str]] = None,
517
+ negative_prompt: Union[str, List[str]] = None,
518
+ negative_prompt_2: Union[str, List[str]] = None,
519
+ height: int = 544,
520
+ width: int = 960,
521
+ num_frames: int = 97,
522
+ num_inference_steps: int = 50,
523
+ sigmas: List[float] = None,
524
+ true_cfg_scale: float = 6.0,
525
+ guidance_scale: float = 1.0,
526
+ num_videos_per_prompt: Optional[int] = 1,
527
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
528
+ latents: Optional[torch.Tensor] = None,
529
+ prompt_embeds: Optional[torch.Tensor] = None,
530
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
531
+ prompt_attention_mask: Optional[torch.Tensor] = None,
532
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
533
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
534
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
535
+ output_type: Optional[str] = "pil",
536
+ return_dict: bool = True,
537
+ attention_kwargs: Optional[Dict[str, Any]] = None,
538
+ callback_on_step_end: Optional[
539
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
540
+ ] = None,
541
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
542
+ prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
543
+ max_sequence_length: int = 256,
544
+ ):
545
+ r"""
546
+ The call function to the pipeline for generation.
547
+
548
+ Args:
549
+ prompt (`str` or `List[str]`, *optional*):
550
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
551
+ instead.
552
+ prompt_2 (`str` or `List[str]`, *optional*):
553
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
554
+ will be used instead.
555
+ negative_prompt (`str` or `List[str]`, *optional*):
556
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
557
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
558
+ not greater than `1`).
559
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
560
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
561
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
562
+ height (`int`, defaults to `544`):
563
+ The height in pixels of the generated image.
564
+ width (`int`, defaults to `960`):
565
+ The width in pixels of the generated image.
566
+ num_frames (`int`, defaults to `97`):
567
+ The number of frames in the generated video.
568
+ num_inference_steps (`int`, defaults to `50`):
569
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
570
+ expense of slower inference.
571
+ sigmas (`List[float]`, *optional*):
572
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
573
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
574
+ will be used.
575
+ true_cfg_scale (`float`, *optional*, defaults to 6.0):
576
+ When greater than 1.0 and a `negative_prompt` is provided, enables true classifier-free guidance.
577
+ guidance_scale (`float`, defaults to `1.0`):
578
+ Guidance scale as defined in [Classifier-Free Diffusion
579
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
580
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
581
+ `guidance_scale > 1`. Higher guidance scale encourages the model to generate images that are closely linked to
582
+ the text `prompt`, usually at the expense of lower image quality. Note that the only available
583
+ HunyuanVideo model is CFG-distilled, which means that traditional guidance between unconditional and
584
+ conditional latent is not applied.
585
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
586
+ The number of videos to generate per prompt.
587
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
588
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
589
+ generation deterministic.
590
+ latents (`torch.Tensor`, *optional*):
591
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
592
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
593
+ tensor is generated by sampling using the supplied random `generator`.
594
+ prompt_embeds (`torch.Tensor`, *optional*):
595
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
596
+ provided, text embeddings are generated from the `prompt` input argument.
597
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
598
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
599
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
600
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
601
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
602
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
603
+ argument.
604
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
605
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
606
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
607
+ input argument.
608
+ output_type (`str`, *optional*, defaults to `"pil"`):
609
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
610
+ return_dict (`bool`, *optional*, defaults to `True`):
611
+ Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a plain tuple.
612
+ attention_kwargs (`dict`, *optional*):
613
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
614
+ `self.processor` in
615
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
616
+ clip_skip (`int`, *optional*):
617
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
618
+ the output of the pre-final layer will be used for computing the prompt embeddings.
619
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
620
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
621
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
622
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
623
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
624
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
625
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
626
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
627
+ `._callback_tensor_inputs` attribute of your pipeline class.
628
+
629
+ Examples:
630
+
631
+ Returns:
632
+ [`~HunyuanVideoPipelineOutput`] or `tuple`:
633
+ If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned, otherwise a `tuple` is returned
634
+ where the first element is a list with the generated videos.
636
+ """
637
+
638
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
639
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
640
+
641
+ # 1. Check inputs. Raise error if not correct
642
+ self.check_inputs(
643
+ prompt,
644
+ prompt_2,
645
+ height,
646
+ width,
647
+ prompt_embeds,
648
+ callback_on_step_end_tensor_inputs,
649
+ prompt_template,
650
+ )
651
+
652
+ has_neg_prompt = negative_prompt is not None or (
653
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
654
+ )
655
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
656
+
657
+ self._guidance_scale = guidance_scale
658
+ self._attention_kwargs = attention_kwargs
659
+ self._current_timestep = None
660
+ self._interrupt = False
661
+
662
+ device = self._execution_device
663
+
664
+ # 2. Define call parameters
665
+ if prompt is not None and isinstance(prompt, str):
666
+ batch_size = 1
667
+ elif prompt is not None and isinstance(prompt, list):
668
+ batch_size = len(prompt)
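A hedged sketch of the two common branches in `retrieve_latents`, reusing `pipe` and `image` from the example docstring at the top of this file:

```python
pixels = pipe.video_processor.preprocess(image, height=544, width=960)
pixels = pixels.unsqueeze(2).to("cuda", pipe.vae.dtype)   # add the frame axis: [B, C, 1, H, W]

posterior = pipe.vae.encode(pixels)                        # output exposes `.latent_dist`
stochastic = retrieve_latents(posterior)                   # draws a sample from the posterior
deterministic = retrieve_latents(posterior, sample_mode="argmax")  # returns the distribution mode
```
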
669
+ else:
670
+ batch_size = prompt_embeds.shape[0]
671
+
672
+ # 3. Encode input prompt
673
+ transformer_dtype = self.transformer.dtype
674
+ prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
675
+ prompt=prompt,
676
+ prompt_2=prompt_2,
677
+ prompt_template=prompt_template,
678
+ num_videos_per_prompt=num_videos_per_prompt,
679
+ prompt_embeds=prompt_embeds,
680
+ pooled_prompt_embeds=pooled_prompt_embeds,
681
+ prompt_attention_mask=prompt_attention_mask,
682
+ device=device,
683
+ max_sequence_length=max_sequence_length,
684
+ )
685
+ prompt_embeds = prompt_embeds.to(transformer_dtype)
686
+ prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
687
+ pooled_prompt_embeds = pooled_prompt_embeds.to(transformer_dtype)
688
+
689
+ if do_true_cfg:
690
+ negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask = self.encode_prompt(
691
+ prompt=negative_prompt,
692
+ prompt_2=negative_prompt_2,
693
+ prompt_template=prompt_template,
694
+ num_videos_per_prompt=num_videos_per_prompt,
695
+ prompt_embeds=negative_prompt_embeds,
696
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
697
+ prompt_attention_mask=negative_prompt_attention_mask,
698
+ device=device,
699
+ max_sequence_length=max_sequence_length,
700
+ )
701
+ negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
702
+ negative_prompt_attention_mask = negative_prompt_attention_mask.to(transformer_dtype)
703
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.to(transformer_dtype)
704
+
705
+ # 4. Prepare timesteps
706
+ sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
707
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
708
+
709
+ # 5. Prepare latent variables
710
+ vae_dtype = self.vae.dtype
711
+ image = self.video_processor.preprocess(image, height=height, width=width).to(device, vae_dtype)
712
+ num_channels_latents = self.transformer.config.in_channels // 2
713
+ latents, image_latents = self.prepare_latents(
714
+ image,
715
+ batch_size * num_videos_per_prompt,
716
+ num_channels_latents,
717
+ height,
718
+ width,
719
+ num_frames,
720
+ torch.float32,
721
+ device,
722
+ generator,
723
+ latents,
724
+ )
725
+ latent_image_input = image_latents.to(transformer_dtype)
726
+
727
+ # 6. Prepare guidance condition
728
+ guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
729
+
730
+ # 7. Denoising loop
731
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
732
+ self._num_timesteps = len(timesteps)
733
+
734
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
735
+ for i, t in enumerate(timesteps):
736
+ if self.interrupt:
737
+ continue
738
+
739
+ self._current_timestep = t
740
+ latent_model_input = latents.to(transformer_dtype)
741
+ latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=1)
742
+
743
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
744
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
745
+
746
+ noise_pred = self.transformer(
747
+ hidden_states=latent_model_input,
748
+ timestep=timestep,
749
+ encoder_hidden_states=prompt_embeds,
750
+ encoder_attention_mask=prompt_attention_mask,
751
+ pooled_projections=pooled_prompt_embeds,
752
+ guidance=guidance,
753
+ attention_kwargs=attention_kwargs,
754
+ return_dict=False,
755
+ )[0]
756
+
757
+ if do_true_cfg:
758
+ neg_noise_pred = self.transformer(
759
+ hidden_states=latent_model_input,
760
+ timestep=timestep,
761
+ encoder_hidden_states=negative_prompt_embeds,
762
+ encoder_attention_mask=negative_prompt_attention_mask,
763
+ pooled_projections=negative_pooled_prompt_embeds,
764
+ guidance=guidance,
765
+ attention_kwargs=attention_kwargs,
766
+ return_dict=False,
767
+ )[0]
768
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
769
+
770
+ # compute the previous noisy sample x_t -> x_t-1
771
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
772
+
773
+ if callback_on_step_end is not None:
774
+ callback_kwargs = {}
775
+ for k in callback_on_step_end_tensor_inputs:
776
+ callback_kwargs[k] = locals()[k]
777
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
778
+
779
+ latents = callback_outputs.pop("latents", latents)
780
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
781
+
782
+ # call the callback, if provided
783
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
784
+ progress_bar.update()
785
+
786
+ if XLA_AVAILABLE:
787
+ xm.mark_step()
788
+
789
+ self._current_timestep = None
790
+
791
+ if not output_type == "latent":
792
+ latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
793
+ video = self.vae.decode(latents, return_dict=False)[0]
794
+ video = self.video_processor.postprocess_video(video, output_type=output_type)
795
+ else:
796
+ video = latents
797
+
798
+ # Offload all models
799
+ self.maybe_free_model_hooks()
800
+
801
+ if not return_dict:
802
+ return (video,)
803
+
804
+ return HunyuanVideoPipelineOutput(frames=video)
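The `callback_on_step_end` hook documented above receives the step index, the current timestep, and the requested tensors. A minimal sketch of an inspection callback, reusing `pipe` from the example docstring:

```python
def log_latent_norm(pipeline, step, timestep, callback_kwargs):
    latents = callback_kwargs["latents"]
    print(f"step {step:03d}  t={float(timestep):8.2f}  |latents| = {latents.norm().item():.3f}")
    return callback_kwargs  # returned tensors are written back into the denoising loop


# output = pipe(
#     image=image,
#     prompt=prompt,
#     callback_on_step_end=log_latent_norm,
#     callback_on_step_end_tensor_inputs=["latents"],
# ).frames[0]
```
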
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py ADDED
@@ -0,0 +1,755 @@
1
+ # Copyright 2025 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ from transformers import CLIPTextModel, CLIPTokenizer, LlamaModel, LlamaTokenizerFast
21
+
22
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
23
+ from ...loaders import HunyuanVideoLoraLoaderMixin
24
+ from ...models import AutoencoderKLHunyuanVideo, HunyuanVideoTransformer3DModel
25
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
26
+ from ...utils import is_torch_xla_available, logging, replace_example_docstring
27
+ from ...utils.torch_utils import randn_tensor
28
+ from ...video_processor import VideoProcessor
29
+ from ..pipeline_utils import DiffusionPipeline
30
+ from .pipeline_output import HunyuanVideoPipelineOutput
31
+
32
+
33
+ if is_torch_xla_available():
34
+ import torch_xla.core.xla_model as xm
35
+
36
+ XLA_AVAILABLE = True
37
+ else:
38
+ XLA_AVAILABLE = False
39
+
40
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
41
+
42
+
43
+ EXAMPLE_DOC_STRING = """
44
+ Examples:
45
+ ```python
46
+ >>> import torch
47
+ >>> from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
48
+ >>> from diffusers.utils import export_to_video
49
+
50
+ >>> model_id = "hunyuanvideo-community/HunyuanVideo"
51
+ >>> transformer = HunyuanVideoTransformer3DModel.from_pretrained(
52
+ ... model_id, subfolder="transformer", torch_dtype=torch.bfloat16
53
+ ... )
54
+ >>> pipe = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.float16)
55
+ >>> pipe.vae.enable_tiling()
56
+ >>> pipe.to("cuda")
57
+
58
+ >>> output = pipe(
59
+ ... prompt="A cat walks on the grass, realistic",
60
+ ... height=320,
61
+ ... width=512,
62
+ ... num_frames=61,
63
+ ... num_inference_steps=30,
64
+ ... ).frames[0]
65
+ >>> export_to_video(output, "output.mp4", fps=15)
66
+ ```
67
+ """
68
+
69
+
70
+ DEFAULT_PROMPT_TEMPLATE = {
71
+ "template": (
72
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
73
+ "1. The main content and theme of the video."
74
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
75
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
76
+ "4. background environment, light, style and atmosphere."
77
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
78
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
79
+ ),
80
+ "crop_start": 95,
81
+ }
82
+
83
+
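The hard-coded `crop_start` of 95 is the length of the tokenized system prefix that `_get_llama_prompt_embeds` strips from the encoder hidden states. A hedged way to recompute it, assuming the tokenizer shipped in the `tokenizer` subfolder of `hunyuanvideo-community/HunyuanVideo` (the exact count depends on the tokenizer files in use):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer")
template_ids = tok(DEFAULT_PROMPT_TEMPLATE["template"], return_tensors="pt")["input_ids"]
# The fallback branch in _get_llama_prompt_embeds subtracts 2 for the "{}" placeholder and the
# trailing <|eot_id|>; the value printed here should land close to the hard-coded default.
print(template_ids.shape[-1] - 2)
```
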
84
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
85
+ def retrieve_timesteps(
86
+ scheduler,
87
+ num_inference_steps: Optional[int] = None,
88
+ device: Optional[Union[str, torch.device]] = None,
89
+ timesteps: Optional[List[int]] = None,
90
+ sigmas: Optional[List[float]] = None,
91
+ **kwargs,
92
+ ):
93
+ r"""
94
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
95
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
96
+
97
+ Args:
98
+ scheduler (`SchedulerMixin`):
99
+ The scheduler to get timesteps from.
100
+ num_inference_steps (`int`):
101
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
102
+ must be `None`.
103
+ device (`str` or `torch.device`, *optional*):
104
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
105
+ timesteps (`List[int]`, *optional*):
106
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
107
+ `num_inference_steps` and `sigmas` must be `None`.
108
+ sigmas (`List[float]`, *optional*):
109
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
110
+ `num_inference_steps` and `timesteps` must be `None`.
111
+
112
+ Returns:
113
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
114
+ second element is the number of inference steps.
115
+ """
116
+ if timesteps is not None and sigmas is not None:
117
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
118
+ if timesteps is not None:
119
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
120
+ if not accepts_timesteps:
121
+ raise ValueError(
122
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
123
+ f" timestep schedules. Please check whether you are using the correct scheduler."
124
+ )
125
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
126
+ timesteps = scheduler.timesteps
127
+ num_inference_steps = len(timesteps)
128
+ elif sigmas is not None:
129
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
130
+ if not accept_sigmas:
131
+ raise ValueError(
132
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
133
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
134
+ )
135
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
136
+ timesteps = scheduler.timesteps
137
+ num_inference_steps = len(timesteps)
138
+ else:
139
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
140
+ timesteps = scheduler.timesteps
141
+ return timesteps, num_inference_steps
142
+
143
+
144
+ class HunyuanVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
145
+ r"""
146
+ Pipeline for text-to-video generation using HunyuanVideo.
147
+
148
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
149
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
150
+
151
+ Args:
152
+ text_encoder ([`LlamaModel`]):
153
+ [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
154
+ tokenizer (`LlamaTokenizer`):
155
+ Tokenizer from [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
156
+ transformer ([`HunyuanVideoTransformer3DModel`]):
157
+ Conditional Transformer to denoise the encoded image latents.
158
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
159
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
160
+ vae ([`AutoencoderKLHunyuanVideo`]):
161
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
162
+ text_encoder_2 ([`CLIPTextModel`]):
163
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
164
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
165
+ tokenizer_2 (`CLIPTokenizer`):
166
+ Tokenizer of class
167
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
168
+ """
169
+
170
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
171
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
172
+
173
+ def __init__(
174
+ self,
175
+ text_encoder: LlamaModel,
176
+ tokenizer: LlamaTokenizerFast,
177
+ transformer: HunyuanVideoTransformer3DModel,
178
+ vae: AutoencoderKLHunyuanVideo,
179
+ scheduler: FlowMatchEulerDiscreteScheduler,
180
+ text_encoder_2: CLIPTextModel,
181
+ tokenizer_2: CLIPTokenizer,
182
+ ):
183
+ super().__init__()
184
+
185
+ self.register_modules(
186
+ vae=vae,
187
+ text_encoder=text_encoder,
188
+ tokenizer=tokenizer,
189
+ transformer=transformer,
190
+ scheduler=scheduler,
191
+ text_encoder_2=text_encoder_2,
192
+ tokenizer_2=tokenizer_2,
193
+ )
194
+
195
+ self.vae_scale_factor_temporal = self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4
196
+ self.vae_scale_factor_spatial = self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 8
197
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
198
+
199
+ def _get_llama_prompt_embeds(
200
+ self,
201
+ prompt: Union[str, List[str]],
202
+ prompt_template: Dict[str, Any],
203
+ num_videos_per_prompt: int = 1,
204
+ device: Optional[torch.device] = None,
205
+ dtype: Optional[torch.dtype] = None,
206
+ max_sequence_length: int = 256,
207
+ num_hidden_layers_to_skip: int = 2,
208
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
209
+ device = device or self._execution_device
210
+ dtype = dtype or self.text_encoder.dtype
211
+
212
+ prompt = [prompt] if isinstance(prompt, str) else prompt
213
+ batch_size = len(prompt)
214
+
215
+ prompt = [prompt_template["template"].format(p) for p in prompt]
216
+
217
+ crop_start = prompt_template.get("crop_start", None)
218
+ if crop_start is None:
219
+ prompt_template_input = self.tokenizer(
220
+ prompt_template["template"],
221
+ padding="max_length",
222
+ return_tensors="pt",
223
+ return_length=False,
224
+ return_overflowing_tokens=False,
225
+ return_attention_mask=False,
226
+ )
227
+ crop_start = prompt_template_input["input_ids"].shape[-1]
228
+ # Remove <|eot_id|> token and placeholder {}
229
+ crop_start -= 2
230
+
231
+ max_sequence_length += crop_start
232
+ text_inputs = self.tokenizer(
233
+ prompt,
234
+ max_length=max_sequence_length,
235
+ padding="max_length",
236
+ truncation=True,
237
+ return_tensors="pt",
238
+ return_length=False,
239
+ return_overflowing_tokens=False,
240
+ return_attention_mask=True,
241
+ )
242
+ text_input_ids = text_inputs.input_ids.to(device=device)
243
+ prompt_attention_mask = text_inputs.attention_mask.to(device=device)
244
+
245
+ prompt_embeds = self.text_encoder(
246
+ input_ids=text_input_ids,
247
+ attention_mask=prompt_attention_mask,
248
+ output_hidden_states=True,
249
+ ).hidden_states[-(num_hidden_layers_to_skip + 1)]
250
+ prompt_embeds = prompt_embeds.to(dtype=dtype)
251
+
252
+ if crop_start is not None and crop_start > 0:
253
+ prompt_embeds = prompt_embeds[:, crop_start:]
254
+ prompt_attention_mask = prompt_attention_mask[:, crop_start:]
255
+
256
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
257
+ _, seq_len, _ = prompt_embeds.shape
258
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
259
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
260
+ prompt_attention_mask = prompt_attention_mask.repeat(1, num_videos_per_prompt)
261
+ prompt_attention_mask = prompt_attention_mask.view(batch_size * num_videos_per_prompt, seq_len)
262
+
263
+ return prompt_embeds, prompt_attention_mask
264
+
265
+ def _get_clip_prompt_embeds(
266
+ self,
267
+ prompt: Union[str, List[str]],
268
+ num_videos_per_prompt: int = 1,
269
+ device: Optional[torch.device] = None,
270
+ dtype: Optional[torch.dtype] = None,
271
+ max_sequence_length: int = 77,
272
+ ) -> torch.Tensor:
273
+ device = device or self._execution_device
274
+ dtype = dtype or self.text_encoder_2.dtype
275
+
276
+ prompt = [prompt] if isinstance(prompt, str) else prompt
277
+ batch_size = len(prompt)
278
+
279
+ text_inputs = self.tokenizer_2(
280
+ prompt,
281
+ padding="max_length",
282
+ max_length=max_sequence_length,
283
+ truncation=True,
284
+ return_tensors="pt",
285
+ )
286
+
287
+ text_input_ids = text_inputs.input_ids
288
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
289
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
290
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
291
+ logger.warning(
292
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
293
+ f" {max_sequence_length} tokens: {removed_text}"
294
+ )
295
+
296
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False).pooler_output
297
+
298
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
299
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
300
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, -1)
301
+
302
+ return prompt_embeds
303
+
304
+ def encode_prompt(
305
+ self,
306
+ prompt: Union[str, List[str]],
307
+ prompt_2: Union[str, List[str]] = None,
308
+ prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
309
+ num_videos_per_prompt: int = 1,
310
+ prompt_embeds: Optional[torch.Tensor] = None,
311
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
312
+ prompt_attention_mask: Optional[torch.Tensor] = None,
313
+ device: Optional[torch.device] = None,
314
+ dtype: Optional[torch.dtype] = None,
315
+ max_sequence_length: int = 256,
316
+ ):
317
+ if prompt_embeds is None:
318
+ prompt_embeds, prompt_attention_mask = self._get_llama_prompt_embeds(
319
+ prompt,
320
+ prompt_template,
321
+ num_videos_per_prompt,
322
+ device=device,
323
+ dtype=dtype,
324
+ max_sequence_length=max_sequence_length,
325
+ )
326
+
327
+ if pooled_prompt_embeds is None:
328
+ if prompt_2 is None:
329
+ prompt_2 = prompt
330
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
331
+ prompt_2,
332
+ num_videos_per_prompt,
333
+ device=device,
334
+ dtype=dtype,
335
+ max_sequence_length=77,
336
+ )
337
+
338
+ return prompt_embeds, pooled_prompt_embeds, prompt_attention_mask
339
+
340
+ def check_inputs(
341
+ self,
342
+ prompt,
343
+ prompt_2,
344
+ height,
345
+ width,
346
+ prompt_embeds=None,
347
+ callback_on_step_end_tensor_inputs=None,
348
+ prompt_template=None,
349
+ ):
350
+ if height % 16 != 0 or width % 16 != 0:
351
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
352
+
353
+ if callback_on_step_end_tensor_inputs is not None and not all(
354
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
355
+ ):
356
+ raise ValueError(
357
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
358
+ )
359
+
360
+ if prompt is not None and prompt_embeds is not None:
361
+ raise ValueError(
362
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
363
+ " only forward one of the two."
364
+ )
365
+ elif prompt_2 is not None and prompt_embeds is not None:
366
+ raise ValueError(
367
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
368
+ " only forward one of the two."
369
+ )
370
+ elif prompt is None and prompt_embeds is None:
371
+ raise ValueError(
372
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
373
+ )
374
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
375
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
376
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
377
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
378
+
379
+ if prompt_template is not None:
380
+ if not isinstance(prompt_template, dict):
381
+ raise ValueError(f"`prompt_template` has to be of type `dict` but is {type(prompt_template)}")
382
+ if "template" not in prompt_template:
383
+ raise ValueError(
384
+ f"`prompt_template` has to contain a key `template` but only found {prompt_template.keys()}"
385
+ )
386
+
387
+ def prepare_latents(
388
+ self,
389
+ batch_size: int,
390
+ num_channels_latents: int = 32,
391
+ height: int = 720,
392
+ width: int = 1280,
393
+ num_frames: int = 129,
394
+ dtype: Optional[torch.dtype] = None,
395
+ device: Optional[torch.device] = None,
396
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
397
+ latents: Optional[torch.Tensor] = None,
398
+ ) -> torch.Tensor:
399
+ if latents is not None:
400
+ return latents.to(device=device, dtype=dtype)
401
+
402
+ shape = (
403
+ batch_size,
404
+ num_channels_latents,
405
+ (num_frames - 1) // self.vae_scale_factor_temporal + 1,
406
+ int(height) // self.vae_scale_factor_spatial,
407
+ int(width) // self.vae_scale_factor_spatial,
408
+ )
409
+ if isinstance(generator, list) and len(generator) != batch_size:
410
+ raise ValueError(
411
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
412
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
413
+ )
414
+
415
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
416
+ return latents
417
+
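At the text-to-video defaults the latent grid produced above works out as follows (a sketch assuming the stock compression ratios and a 16-channel HunyuanVideo transformer):

```python
# (batch, channels, frames, height, width)
# = (batch, 16, (129 - 1) // 4 + 1, 720 // 8, 1280 // 8)
# = (batch, 16, 33, 90, 160)
```
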
418
+ def enable_vae_slicing(self):
419
+ r"""
420
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
421
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
422
+ """
423
+ self.vae.enable_slicing()
424
+
425
+ def disable_vae_slicing(self):
426
+ r"""
427
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
428
+ computing decoding in one step.
429
+ """
430
+ self.vae.disable_slicing()
431
+
432
+ def enable_vae_tiling(self):
433
+ r"""
434
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
435
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
436
+ processing larger images.
437
+ """
438
+ self.vae.enable_tiling()
439
+
440
+ def disable_vae_tiling(self):
441
+ r"""
442
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
443
+ computing decoding in one step.
444
+ """
445
+ self.vae.disable_tiling()
446
+
447
+ @property
448
+ def guidance_scale(self):
449
+ return self._guidance_scale
450
+
451
+ @property
452
+ def num_timesteps(self):
453
+ return self._num_timesteps
454
+
455
+ @property
456
+ def attention_kwargs(self):
457
+ return self._attention_kwargs
458
+
459
+ @property
460
+ def current_timestep(self):
461
+ return self._current_timestep
462
+
463
+ @property
464
+ def interrupt(self):
465
+ return self._interrupt
466
+
467
+ @torch.no_grad()
468
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
469
+ def __call__(
470
+ self,
471
+ prompt: Union[str, List[str]] = None,
472
+ prompt_2: Union[str, List[str]] = None,
473
+ negative_prompt: Union[str, List[str]] = None,
474
+ negative_prompt_2: Union[str, List[str]] = None,
475
+ height: int = 720,
476
+ width: int = 1280,
477
+ num_frames: int = 129,
478
+ num_inference_steps: int = 50,
479
+ sigmas: List[float] = None,
480
+ true_cfg_scale: float = 1.0,
481
+ guidance_scale: float = 6.0,
482
+ num_videos_per_prompt: Optional[int] = 1,
483
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
484
+ latents: Optional[torch.Tensor] = None,
485
+ prompt_embeds: Optional[torch.Tensor] = None,
486
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
487
+ prompt_attention_mask: Optional[torch.Tensor] = None,
488
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
489
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
490
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
491
+ output_type: Optional[str] = "pil",
492
+ return_dict: bool = True,
493
+ attention_kwargs: Optional[Dict[str, Any]] = None,
494
+ callback_on_step_end: Optional[
495
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
496
+ ] = None,
497
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
498
+ prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
499
+ max_sequence_length: int = 256,
500
+ ):
501
+ r"""
502
+ The call function to the pipeline for generation.
503
+
504
+ Args:
505
+ prompt (`str` or `List[str]`, *optional*):
506
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
507
+ instead.
508
+ prompt_2 (`str` or `List[str]`, *optional*):
509
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
510
+ will be used instead.
511
+ negative_prompt (`str` or `List[str]`, *optional*):
512
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
513
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
514
+ not greater than `1`).
515
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
516
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
517
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
518
+ height (`int`, defaults to `720`):
519
+ The height in pixels of the generated image.
520
+ width (`int`, defaults to `1280`):
521
+ The width in pixels of the generated image.
522
+ num_frames (`int`, defaults to `129`):
523
+ The number of frames in the generated video.
524
+ num_inference_steps (`int`, defaults to `50`):
525
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
526
+ expense of slower inference.
527
+ sigmas (`List[float]`, *optional*):
528
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
529
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
530
+ will be used.
531
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
532
+ True classifier-free guidance (guidance scale) is enabled when `true_cfg_scale` > 1 and
533
+ `negative_prompt` is provided.
534
+ guidance_scale (`float`, defaults to `6.0`):
535
+ Embedded guidance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages
536
+ a model to generate images more aligned with `prompt` at the expense of lower image quality.
537
+
538
+ Guidance-distilled models approximate true classifier-free guidance for `guidance_scale` > 1. Refer to
539
+ the [paper](https://huggingface.co/papers/2210.03142) to learn more.
540
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
541
+ The number of videos to generate per prompt.
542
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
543
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
544
+ generation deterministic.
545
+ latents (`torch.Tensor`, *optional*):
546
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
547
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
548
+ tensor is generated by sampling using the supplied random `generator`.
549
+ prompt_embeds (`torch.Tensor`, *optional*):
550
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
551
+ provided, text embeddings are generated from the `prompt` input argument.
552
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
553
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
554
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
555
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
556
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
557
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
558
+ argument.
559
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
560
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
561
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
562
+ input argument.
563
+ output_type (`str`, *optional*, defaults to `"pil"`):
564
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
565
+ return_dict (`bool`, *optional*, defaults to `True`):
566
+ Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a plain tuple.
567
+ attention_kwargs (`dict`, *optional*):
568
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
569
+ `self.processor` in
570
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
571
+ clip_skip (`int`, *optional*):
572
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
573
+ the output of the pre-final layer will be used for computing the prompt embeddings.
574
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
575
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
576
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
577
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
578
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
579
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
580
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
581
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
582
+ `._callback_tensor_inputs` attribute of your pipeline class.
583
+
584
+ Examples:
585
+
586
+ Returns:
587
+ [`~HunyuanVideoPipelineOutput`] or `tuple`:
588
+ If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned, otherwise a `tuple` is returned
589
+ where the first element is a list with the generated frames.
591
+ """
592
+
593
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
594
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
595
+
596
+ # 1. Check inputs. Raise error if not correct
597
+ self.check_inputs(
598
+ prompt,
599
+ prompt_2,
600
+ height,
601
+ width,
602
+ prompt_embeds,
603
+ callback_on_step_end_tensor_inputs,
604
+ prompt_template,
605
+ )
606
+
607
+ has_neg_prompt = negative_prompt is not None or (
608
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
609
+ )
610
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
611
+
612
+ self._guidance_scale = guidance_scale
613
+ self._attention_kwargs = attention_kwargs
614
+ self._current_timestep = None
615
+ self._interrupt = False
616
+
617
+ device = self._execution_device
618
+
619
+ # 2. Define call parameters
620
+ if prompt is not None and isinstance(prompt, str):
621
+ batch_size = 1
622
+ elif prompt is not None and isinstance(prompt, list):
623
+ batch_size = len(prompt)
624
+ else:
625
+ batch_size = prompt_embeds.shape[0]
626
+
627
+ # 3. Encode input prompt
628
+ transformer_dtype = self.transformer.dtype
629
+ prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
630
+ prompt=prompt,
631
+ prompt_2=prompt_2,
632
+ prompt_template=prompt_template,
633
+ num_videos_per_prompt=num_videos_per_prompt,
634
+ prompt_embeds=prompt_embeds,
635
+ pooled_prompt_embeds=pooled_prompt_embeds,
636
+ prompt_attention_mask=prompt_attention_mask,
637
+ device=device,
638
+ max_sequence_length=max_sequence_length,
639
+ )
640
+ prompt_embeds = prompt_embeds.to(transformer_dtype)
641
+ prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
642
+ pooled_prompt_embeds = pooled_prompt_embeds.to(transformer_dtype)
643
+
644
+ if do_true_cfg:
645
+ negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask = self.encode_prompt(
646
+ prompt=negative_prompt,
647
+ prompt_2=negative_prompt_2,
648
+ prompt_template=prompt_template,
649
+ num_videos_per_prompt=num_videos_per_prompt,
650
+ prompt_embeds=negative_prompt_embeds,
651
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
652
+ prompt_attention_mask=negative_prompt_attention_mask,
653
+ device=device,
654
+ max_sequence_length=max_sequence_length,
655
+ )
656
+ negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
657
+ negative_prompt_attention_mask = negative_prompt_attention_mask.to(transformer_dtype)
658
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.to(transformer_dtype)
659
+
660
+ # 4. Prepare timesteps
661
+ sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
662
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
663
+
664
+ # 5. Prepare latent variables
665
+ num_channels_latents = self.transformer.config.in_channels
666
+ latents = self.prepare_latents(
667
+ batch_size * num_videos_per_prompt,
668
+ num_channels_latents,
669
+ height,
670
+ width,
671
+ num_frames,
672
+ torch.float32,
673
+ device,
674
+ generator,
675
+ latents,
676
+ )
677
+
678
+ # 6. Prepare guidance condition
679
+ guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
680
+
681
+ # 7. Denoising loop
682
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
683
+ self._num_timesteps = len(timesteps)
684
+
685
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
686
+ for i, t in enumerate(timesteps):
687
+ if self.interrupt:
688
+ continue
689
+
690
+ self._current_timestep = t
691
+ latent_model_input = latents.to(transformer_dtype)
692
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
693
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
694
+
695
+ with self.transformer.cache_context("cond"):
696
+ noise_pred = self.transformer(
697
+ hidden_states=latent_model_input,
698
+ timestep=timestep,
699
+ encoder_hidden_states=prompt_embeds,
700
+ encoder_attention_mask=prompt_attention_mask,
701
+ pooled_projections=pooled_prompt_embeds,
702
+ guidance=guidance,
703
+ attention_kwargs=attention_kwargs,
704
+ return_dict=False,
705
+ )[0]
706
+
707
+ if do_true_cfg:
708
+ with self.transformer.cache_context("uncond"):
709
+ neg_noise_pred = self.transformer(
710
+ hidden_states=latent_model_input,
711
+ timestep=timestep,
712
+ encoder_hidden_states=negative_prompt_embeds,
713
+ encoder_attention_mask=negative_prompt_attention_mask,
714
+ pooled_projections=negative_pooled_prompt_embeds,
715
+ guidance=guidance,
716
+ attention_kwargs=attention_kwargs,
717
+ return_dict=False,
718
+ )[0]
719
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
720
+
721
+ # compute the previous noisy sample x_t -> x_t-1
722
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
723
+
724
+ if callback_on_step_end is not None:
725
+ callback_kwargs = {}
726
+ for k in callback_on_step_end_tensor_inputs:
727
+ callback_kwargs[k] = locals()[k]
728
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
729
+
730
+ latents = callback_outputs.pop("latents", latents)
731
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
732
+
733
+ # call the callback, if provided
734
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
735
+ progress_bar.update()
736
+
737
+ if XLA_AVAILABLE:
738
+ xm.mark_step()
739
+
740
+ self._current_timestep = None
741
+
742
+ if not output_type == "latent":
743
+ latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
744
+ video = self.vae.decode(latents, return_dict=False)[0]
745
+ video = self.video_processor.postprocess_video(video, output_type=output_type)
746
+ else:
747
+ video = latents
748
+
749
+ # Offload all models
750
+ self.maybe_free_model_hooks()
751
+
752
+ if not return_dict:
753
+ return (video,)
754
+
755
+ return HunyuanVideoPipelineOutput(frames=video)
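The `Examples:` placeholder in the docstring above is normally filled in by the `replace_example_docstring` decorator. As a hedged orientation sketch only (the checkpoint id, resolution, frame count, and step count below are assumptions, not values taken from this diff), end-to-end usage of this pipeline looks roughly like:

```python
# Rough usage sketch for HunyuanVideoPipeline (illustrative values, not from this diff).
import torch
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

model_id = "hunyuanvideo-community/HunyuanVideo"  # assumed community checkpoint
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.float16)
pipe.vae.enable_tiling()  # reduces VAE memory use at decode time
pipe.to("cuda")

output = pipe(
    prompt="A cat walks on the grass, realistic style.",
    height=320,
    width=512,
    num_frames=61,
    num_inference_steps=30,
    guidance_scale=6.0,   # embedded (distilled) guidance, as documented above
    true_cfg_scale=1.0,   # > 1.0 together with a negative_prompt enables true CFG
).frames[0]
export_to_video(output, "output.mp4", fps=15)
```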
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py ADDED
@@ -0,0 +1,1114 @@
1
+ # Copyright 2025 The Framepack Team, The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import math
17
+ from enum import Enum
18
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+ from transformers import (
23
+ CLIPTextModel,
24
+ CLIPTokenizer,
25
+ LlamaModel,
26
+ LlamaTokenizerFast,
27
+ SiglipImageProcessor,
28
+ SiglipVisionModel,
29
+ )
30
+
31
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
32
+ from ...image_processor import PipelineImageInput
33
+ from ...loaders import HunyuanVideoLoraLoaderMixin
34
+ from ...models import AutoencoderKLHunyuanVideo, HunyuanVideoFramepackTransformer3DModel
35
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
36
+ from ...utils import is_torch_xla_available, logging, replace_example_docstring
37
+ from ...utils.torch_utils import randn_tensor
38
+ from ...video_processor import VideoProcessor
39
+ from ..pipeline_utils import DiffusionPipeline
40
+ from .pipeline_output import HunyuanVideoFramepackPipelineOutput
41
+
42
+
43
+ if is_torch_xla_available():
44
+ import torch_xla.core.xla_model as xm
45
+
46
+ XLA_AVAILABLE = True
47
+ else:
48
+ XLA_AVAILABLE = False
49
+
50
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
51
+
52
+
53
+ # TODO(yiyi): We can pack the checkpoints nicely with modular loader
54
+ EXAMPLE_DOC_STRING = """
55
+ Examples:
56
+ ##### Image-to-Video
57
+
58
+ ```python
59
+ >>> import torch
60
+ >>> from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
61
+ >>> from diffusers.utils import export_to_video, load_image
62
+ >>> from transformers import SiglipImageProcessor, SiglipVisionModel
63
+
64
+ >>> transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
65
+ ... "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
66
+ ... )
67
+ >>> feature_extractor = SiglipImageProcessor.from_pretrained(
68
+ ... "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
69
+ ... )
70
+ >>> image_encoder = SiglipVisionModel.from_pretrained(
71
+ ... "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
72
+ ... )
73
+ >>> pipe = HunyuanVideoFramepackPipeline.from_pretrained(
74
+ ... "hunyuanvideo-community/HunyuanVideo",
75
+ ... transformer=transformer,
76
+ ... feature_extractor=feature_extractor,
77
+ ... image_encoder=image_encoder,
78
+ ... torch_dtype=torch.float16,
79
+ ... )
80
+ >>> pipe.vae.enable_tiling()
81
+ >>> pipe.to("cuda")
82
+
83
+ >>> image = load_image(
84
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
85
+ ... )
86
+ >>> output = pipe(
87
+ ... image=image,
88
+ ... prompt="A penguin dancing in the snow",
89
+ ... height=832,
90
+ ... width=480,
91
+ ... num_frames=91,
92
+ ... num_inference_steps=30,
93
+ ... guidance_scale=9.0,
94
+ ... generator=torch.Generator().manual_seed(0),
95
+ ... sampling_type="inverted_anti_drifting",
96
+ ... ).frames[0]
97
+ >>> export_to_video(output, "output.mp4", fps=30)
98
+ ```
99
+
100
+ ##### First and Last Image-to-Video
101
+
102
+ ```python
103
+ >>> import torch
104
+ >>> from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
105
+ >>> from diffusers.utils import export_to_video, load_image
106
+ >>> from transformers import SiglipImageProcessor, SiglipVisionModel
107
+
108
+ >>> transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
109
+ ... "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
110
+ ... )
111
+ >>> feature_extractor = SiglipImageProcessor.from_pretrained(
112
+ ... "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
113
+ ... )
114
+ >>> image_encoder = SiglipVisionModel.from_pretrained(
115
+ ... "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
116
+ ... )
117
+ >>> pipe = HunyuanVideoFramepackPipeline.from_pretrained(
118
+ ... "hunyuanvideo-community/HunyuanVideo",
119
+ ... transformer=transformer,
120
+ ... feature_extractor=feature_extractor,
121
+ ... image_encoder=image_encoder,
122
+ ... torch_dtype=torch.float16,
123
+ ... )
124
+ >>> pipe.to("cuda")
125
+
126
+ >>> prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
127
+ >>> first_image = load_image(
128
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
129
+ ... )
130
+ >>> last_image = load_image(
131
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
132
+ ... )
133
+ >>> output = pipe(
134
+ ... image=first_image,
135
+ ... last_image=last_image,
136
+ ... prompt=prompt,
137
+ ... height=512,
138
+ ... width=512,
139
+ ... num_frames=91,
140
+ ... num_inference_steps=30,
141
+ ... guidance_scale=9.0,
142
+ ... generator=torch.Generator().manual_seed(0),
143
+ ... sampling_type="inverted_anti_drifting",
144
+ ... ).frames[0]
145
+ >>> export_to_video(output, "output.mp4", fps=30)
146
+ ```
147
+ """
148
+
149
+
150
+ DEFAULT_PROMPT_TEMPLATE = {
151
+ "template": (
152
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
153
+ "1. The main content and theme of the video."
154
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
155
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
156
+ "4. background environment, light, style and atmosphere."
157
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
158
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
159
+ ),
160
+ "crop_start": 95,
161
+ }
162
+
163
+
164
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
165
+ def calculate_shift(
166
+ image_seq_len,
167
+ base_seq_len: int = 256,
168
+ max_seq_len: int = 4096,
169
+ base_shift: float = 0.5,
170
+ max_shift: float = 1.15,
171
+ ):
172
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
173
+ b = base_shift - m * base_seq_len
174
+ mu = image_seq_len * m + b
175
+ return mu
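`calculate_shift` is a straight linear interpolation of the scheduler shift `mu` in the token sequence length. A quick numeric sketch (the `image_seq_len` value is assumed purely for illustration):

```python
# Illustration only: the linear map implemented by calculate_shift above.
base_seq_len, max_seq_len = 256, 4096
base_shift, max_shift = 0.5, 1.15

m = (max_shift - base_shift) / (max_seq_len - base_seq_len)  # slope ~= 1.69e-4
b = base_shift - m * base_seq_len                            # intercept ~= 0.457

image_seq_len = 2048                                         # assumed example value
mu = image_seq_len * m + b                                   # ~= 0.80
```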
176
+
177
+
178
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
179
+ def retrieve_timesteps(
180
+ scheduler,
181
+ num_inference_steps: Optional[int] = None,
182
+ device: Optional[Union[str, torch.device]] = None,
183
+ timesteps: Optional[List[int]] = None,
184
+ sigmas: Optional[List[float]] = None,
185
+ **kwargs,
186
+ ):
187
+ r"""
188
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
189
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
190
+
191
+ Args:
192
+ scheduler (`SchedulerMixin`):
193
+ The scheduler to get timesteps from.
194
+ num_inference_steps (`int`):
195
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
196
+ must be `None`.
197
+ device (`str` or `torch.device`, *optional*):
198
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
199
+ timesteps (`List[int]`, *optional*):
200
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
201
+ `num_inference_steps` and `sigmas` must be `None`.
202
+ sigmas (`List[float]`, *optional*):
203
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
204
+ `num_inference_steps` and `timesteps` must be `None`.
205
+
206
+ Returns:
207
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
208
+ second element is the number of inference steps.
209
+ """
210
+ if timesteps is not None and sigmas is not None:
211
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
212
+ if timesteps is not None:
213
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
214
+ if not accepts_timesteps:
215
+ raise ValueError(
216
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
217
+ f" timestep schedules. Please check whether you are using the correct scheduler."
218
+ )
219
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
220
+ timesteps = scheduler.timesteps
221
+ num_inference_steps = len(timesteps)
222
+ elif sigmas is not None:
223
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
224
+ if not accept_sigmas:
225
+ raise ValueError(
226
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
227
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
228
+ )
229
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
230
+ timesteps = scheduler.timesteps
231
+ num_inference_steps = len(timesteps)
232
+ else:
233
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
234
+ timesteps = scheduler.timesteps
235
+ return timesteps, num_inference_steps
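Later in this file, `retrieve_timesteps` is driven with a custom `sigmas` array plus the `mu` returned by `calculate_shift`. A minimal sketch of that calling pattern (the scheduler configuration and the `mu` value are assumptions for illustration):

```python
# Sketch only: calling retrieve_timesteps (the helper defined above) with custom sigmas and a dynamic-shift mu.
import numpy as np
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler(use_dynamic_shifting=True)  # assumed config
num_inference_steps = 30
sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1]  # same schedule the pipeline builds

timesteps, num_inference_steps = retrieve_timesteps(
    scheduler, num_inference_steps, device="cpu", sigmas=sigmas, mu=0.8
)
```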
236
+
237
+
238
+ class FramepackSamplingType(str, Enum):
239
+ VANILLA = "vanilla"
240
+ INVERTED_ANTI_DRIFTING = "inverted_anti_drifting"
241
+
242
+
243
+ class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
244
+ r"""
245
+ Pipeline for image-to-video generation using HunyuanVideo Framepack.
246
+
247
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
248
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
249
+
250
+ Args:
251
+ text_encoder ([`LlamaModel`]):
252
+ [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
253
+ tokenizer (`LlamaTokenizer`):
254
+ Tokenizer from [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
255
+ transformer ([`HunyuanVideoFramepackTransformer3DModel`]):
256
+ Conditional Transformer to denoise the encoded image latents.
257
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
258
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
259
+ vae ([`AutoencoderKLHunyuanVideo`]):
260
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
261
+ text_encoder_2 ([`CLIPTextModel`]):
262
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
263
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
264
+ tokenizer_2 (`CLIPTokenizer`):
265
+ Tokenizer of class
266
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
267
+ """
268
+
269
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
270
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
271
+
272
+ def __init__(
273
+ self,
274
+ text_encoder: LlamaModel,
275
+ tokenizer: LlamaTokenizerFast,
276
+ transformer: HunyuanVideoFramepackTransformer3DModel,
277
+ vae: AutoencoderKLHunyuanVideo,
278
+ scheduler: FlowMatchEulerDiscreteScheduler,
279
+ text_encoder_2: CLIPTextModel,
280
+ tokenizer_2: CLIPTokenizer,
281
+ image_encoder: SiglipVisionModel,
282
+ feature_extractor: SiglipImageProcessor,
283
+ ):
284
+ super().__init__()
285
+
286
+ self.register_modules(
287
+ vae=vae,
288
+ text_encoder=text_encoder,
289
+ tokenizer=tokenizer,
290
+ transformer=transformer,
291
+ scheduler=scheduler,
292
+ text_encoder_2=text_encoder_2,
293
+ tokenizer_2=tokenizer_2,
294
+ image_encoder=image_encoder,
295
+ feature_extractor=feature_extractor,
296
+ )
297
+
298
+ self.vae_scale_factor_temporal = self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4
299
+ self.vae_scale_factor_spatial = self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 8
300
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
301
+
302
+ # Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline._get_llama_prompt_embeds
303
+ def _get_llama_prompt_embeds(
304
+ self,
305
+ prompt: Union[str, List[str]],
306
+ prompt_template: Dict[str, Any],
307
+ num_videos_per_prompt: int = 1,
308
+ device: Optional[torch.device] = None,
309
+ dtype: Optional[torch.dtype] = None,
310
+ max_sequence_length: int = 256,
311
+ num_hidden_layers_to_skip: int = 2,
312
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
313
+ device = device or self._execution_device
314
+ dtype = dtype or self.text_encoder.dtype
315
+
316
+ prompt = [prompt] if isinstance(prompt, str) else prompt
317
+ batch_size = len(prompt)
318
+
319
+ prompt = [prompt_template["template"].format(p) for p in prompt]
320
+
321
+ crop_start = prompt_template.get("crop_start", None)
322
+ if crop_start is None:
323
+ prompt_template_input = self.tokenizer(
324
+ prompt_template["template"],
325
+ padding="max_length",
326
+ return_tensors="pt",
327
+ return_length=False,
328
+ return_overflowing_tokens=False,
329
+ return_attention_mask=False,
330
+ )
331
+ crop_start = prompt_template_input["input_ids"].shape[-1]
332
+ # Remove <|eot_id|> token and placeholder {}
333
+ crop_start -= 2
334
+
335
+ max_sequence_length += crop_start
336
+ text_inputs = self.tokenizer(
337
+ prompt,
338
+ max_length=max_sequence_length,
339
+ padding="max_length",
340
+ truncation=True,
341
+ return_tensors="pt",
342
+ return_length=False,
343
+ return_overflowing_tokens=False,
344
+ return_attention_mask=True,
345
+ )
346
+ text_input_ids = text_inputs.input_ids.to(device=device)
347
+ prompt_attention_mask = text_inputs.attention_mask.to(device=device)
348
+
349
+ prompt_embeds = self.text_encoder(
350
+ input_ids=text_input_ids,
351
+ attention_mask=prompt_attention_mask,
352
+ output_hidden_states=True,
353
+ ).hidden_states[-(num_hidden_layers_to_skip + 1)]
354
+ prompt_embeds = prompt_embeds.to(dtype=dtype)
355
+
356
+ if crop_start is not None and crop_start > 0:
357
+ prompt_embeds = prompt_embeds[:, crop_start:]
358
+ prompt_attention_mask = prompt_attention_mask[:, crop_start:]
359
+
360
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
361
+ _, seq_len, _ = prompt_embeds.shape
362
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
363
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
364
+ prompt_attention_mask = prompt_attention_mask.repeat(1, num_videos_per_prompt)
365
+ prompt_attention_mask = prompt_attention_mask.view(batch_size * num_videos_per_prompt, seq_len)
366
+
367
+ return prompt_embeds, prompt_attention_mask
368
+
369
+ # Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline._get_clip_prompt_embeds
370
+ def _get_clip_prompt_embeds(
371
+ self,
372
+ prompt: Union[str, List[str]],
373
+ num_videos_per_prompt: int = 1,
374
+ device: Optional[torch.device] = None,
375
+ dtype: Optional[torch.dtype] = None,
376
+ max_sequence_length: int = 77,
377
+ ) -> torch.Tensor:
378
+ device = device or self._execution_device
379
+ dtype = dtype or self.text_encoder_2.dtype
380
+
381
+ prompt = [prompt] if isinstance(prompt, str) else prompt
382
+ batch_size = len(prompt)
383
+
384
+ text_inputs = self.tokenizer_2(
385
+ prompt,
386
+ padding="max_length",
387
+ max_length=max_sequence_length,
388
+ truncation=True,
389
+ return_tensors="pt",
390
+ )
391
+
392
+ text_input_ids = text_inputs.input_ids
393
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
394
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
395
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
396
+ logger.warning(
397
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
398
+ f" {max_sequence_length} tokens: {removed_text}"
399
+ )
400
+
401
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False).pooler_output
402
+
403
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
404
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
405
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, -1)
406
+
407
+ return prompt_embeds
408
+
409
+ # Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline.encode_prompt
410
+ def encode_prompt(
411
+ self,
412
+ prompt: Union[str, List[str]],
413
+ prompt_2: Union[str, List[str]] = None,
414
+ prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
415
+ num_videos_per_prompt: int = 1,
416
+ prompt_embeds: Optional[torch.Tensor] = None,
417
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
418
+ prompt_attention_mask: Optional[torch.Tensor] = None,
419
+ device: Optional[torch.device] = None,
420
+ dtype: Optional[torch.dtype] = None,
421
+ max_sequence_length: int = 256,
422
+ ):
423
+ if prompt_embeds is None:
424
+ prompt_embeds, prompt_attention_mask = self._get_llama_prompt_embeds(
425
+ prompt,
426
+ prompt_template,
427
+ num_videos_per_prompt,
428
+ device=device,
429
+ dtype=dtype,
430
+ max_sequence_length=max_sequence_length,
431
+ )
432
+
433
+ if pooled_prompt_embeds is None:
434
+ if prompt_2 is None:
435
+ prompt_2 = prompt
436
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
437
+ prompt,
438
+ num_videos_per_prompt,
439
+ device=device,
440
+ dtype=dtype,
441
+ max_sequence_length=77,
442
+ )
443
+
444
+ return prompt_embeds, pooled_prompt_embeds, prompt_attention_mask
445
+
446
+ def encode_image(
447
+ self, image: torch.Tensor, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None
448
+ ):
449
+ device = device or self._execution_device
450
+ image = (image + 1) / 2.0 # [-1, 1] -> [0, 1]
451
+ image = self.feature_extractor(images=image, return_tensors="pt", do_rescale=False).to(
452
+ device=device, dtype=self.image_encoder.dtype
453
+ )
454
+ image_embeds = self.image_encoder(**image).last_hidden_state
455
+ return image_embeds.to(dtype=dtype)
456
+
457
+ def check_inputs(
458
+ self,
459
+ prompt,
460
+ prompt_2,
461
+ height,
462
+ width,
463
+ prompt_embeds=None,
464
+ callback_on_step_end_tensor_inputs=None,
465
+ prompt_template=None,
466
+ image=None,
467
+ image_latents=None,
468
+ last_image=None,
469
+ last_image_latents=None,
470
+ sampling_type=None,
471
+ ):
472
+ if height % 16 != 0 or width % 16 != 0:
473
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
474
+
475
+ if callback_on_step_end_tensor_inputs is not None and not all(
476
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
477
+ ):
478
+ raise ValueError(
479
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
480
+ )
481
+
482
+ if prompt is not None and prompt_embeds is not None:
483
+ raise ValueError(
484
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
485
+ " only forward one of the two."
486
+ )
487
+ elif prompt_2 is not None and prompt_embeds is not None:
488
+ raise ValueError(
489
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
490
+ " only forward one of the two."
491
+ )
492
+ elif prompt is None and prompt_embeds is None:
493
+ raise ValueError(
494
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
495
+ )
496
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
497
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
498
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
499
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
500
+
501
+ if prompt_template is not None:
502
+ if not isinstance(prompt_template, dict):
503
+ raise ValueError(f"`prompt_template` has to be of type `dict` but is {type(prompt_template)}")
504
+ if "template" not in prompt_template:
505
+ raise ValueError(
506
+ f"`prompt_template` has to contain a key `template` but only found {prompt_template.keys()}"
507
+ )
508
+
509
+ sampling_types = [x.value for x in FramepackSamplingType.__members__.values()]
510
+ if sampling_type not in sampling_types:
511
+ raise ValueError(f"`sampling_type` has to be one of '{sampling_types}' but is '{sampling_type}'")
512
+
513
+ if image is not None and image_latents is not None:
514
+ raise ValueError("Only one of `image` or `image_latents` can be passed.")
515
+ if last_image is not None and last_image_latents is not None:
516
+ raise ValueError("Only one of `last_image` or `last_image_latents` can be passed.")
517
+ if sampling_type != FramepackSamplingType.INVERTED_ANTI_DRIFTING and (
518
+ last_image is not None or last_image_latents is not None
519
+ ):
520
+ raise ValueError(
521
+ 'Only `"inverted_anti_drifting"` inference type supports `last_image` or `last_image_latents`.'
522
+ )
523
+
524
+ def prepare_latents(
525
+ self,
526
+ batch_size: int = 1,
527
+ num_channels_latents: int = 16,
528
+ height: int = 720,
529
+ width: int = 1280,
530
+ num_frames: int = 129,
531
+ dtype: Optional[torch.dtype] = None,
532
+ device: Optional[torch.device] = None,
533
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
534
+ latents: Optional[torch.Tensor] = None,
535
+ ) -> torch.Tensor:
536
+ if latents is not None:
537
+ return latents.to(device=device, dtype=dtype)
538
+ shape = (
539
+ batch_size,
540
+ num_channels_latents,
541
+ (num_frames - 1) // self.vae_scale_factor_temporal + 1,
542
+ int(height) // self.vae_scale_factor_spatial,
543
+ int(width) // self.vae_scale_factor_spatial,
544
+ )
545
+ if isinstance(generator, list) and len(generator) != batch_size:
546
+ raise ValueError(
547
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
548
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
549
+ )
550
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
551
+ return latents
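For orientation, the shape computed above, under the compression ratios this pipeline defaults to (temporal 4, spatial 8) and the per-window frame count used later in `__call__`, works out as in this small sketch (all values illustrative):

```python
# Illustration only: latent shape for one Framepack window (values assumed).
batch_size, num_channels_latents = 1, 16
height, width = 720, 1280
latent_window_size = 9
vae_scale_factor_temporal, vae_scale_factor_spatial = 4, 8

window_num_frames = (latent_window_size - 1) * vae_scale_factor_temporal + 1  # 33
latent_frames = (window_num_frames - 1) // vae_scale_factor_temporal + 1      # 9

shape = (
    batch_size,
    num_channels_latents,
    latent_frames,
    height // vae_scale_factor_spatial,  # 90
    width // vae_scale_factor_spatial,   # 160
)
print(shape)  # (1, 16, 9, 90, 160)
```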
552
+
553
+ def prepare_image_latents(
554
+ self,
555
+ image: torch.Tensor,
556
+ dtype: Optional[torch.dtype] = None,
557
+ device: Optional[torch.device] = None,
558
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
559
+ latents: Optional[torch.Tensor] = None,
560
+ ) -> torch.Tensor:
561
+ device = device or self._execution_device
562
+ if latents is None:
563
+ image = image.unsqueeze(2).to(device=device, dtype=self.vae.dtype)
564
+ latents = self.vae.encode(image).latent_dist.sample(generator=generator)
565
+ latents = latents * self.vae.config.scaling_factor
566
+ return latents.to(device=device, dtype=dtype)
567
+
568
+ def enable_vae_slicing(self):
569
+ r"""
570
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
571
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
572
+ """
573
+ self.vae.enable_slicing()
574
+
575
+ def disable_vae_slicing(self):
576
+ r"""
577
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
578
+ computing decoding in one step.
579
+ """
580
+ self.vae.disable_slicing()
581
+
582
+ def enable_vae_tiling(self):
583
+ r"""
584
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
585
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
586
+ processing larger images.
587
+ """
588
+ self.vae.enable_tiling()
589
+
590
+ def disable_vae_tiling(self):
591
+ r"""
592
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
593
+ computing decoding in one step.
594
+ """
595
+ self.vae.disable_tiling()
596
+
597
+ @property
598
+ def guidance_scale(self):
599
+ return self._guidance_scale
600
+
601
+ @property
602
+ def num_timesteps(self):
603
+ return self._num_timesteps
604
+
605
+ @property
606
+ def attention_kwargs(self):
607
+ return self._attention_kwargs
608
+
609
+ @property
610
+ def current_timestep(self):
611
+ return self._current_timestep
612
+
613
+ @property
614
+ def interrupt(self):
615
+ return self._interrupt
616
+
617
+ @torch.no_grad()
618
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
619
+ def __call__(
620
+ self,
621
+ image: PipelineImageInput,
622
+ last_image: Optional[PipelineImageInput] = None,
623
+ prompt: Union[str, List[str]] = None,
624
+ prompt_2: Union[str, List[str]] = None,
625
+ negative_prompt: Union[str, List[str]] = None,
626
+ negative_prompt_2: Union[str, List[str]] = None,
627
+ height: int = 720,
628
+ width: int = 1280,
629
+ num_frames: int = 129,
630
+ latent_window_size: int = 9,
631
+ num_inference_steps: int = 50,
632
+ sigmas: List[float] = None,
633
+ true_cfg_scale: float = 1.0,
634
+ guidance_scale: float = 6.0,
635
+ num_videos_per_prompt: Optional[int] = 1,
636
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
637
+ image_latents: Optional[torch.Tensor] = None,
638
+ last_image_latents: Optional[torch.Tensor] = None,
639
+ prompt_embeds: Optional[torch.Tensor] = None,
640
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
641
+ prompt_attention_mask: Optional[torch.Tensor] = None,
642
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
643
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
644
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
645
+ output_type: Optional[str] = "pil",
646
+ return_dict: bool = True,
647
+ attention_kwargs: Optional[Dict[str, Any]] = None,
648
+ callback_on_step_end: Optional[
649
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
650
+ ] = None,
651
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
652
+ prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
653
+ max_sequence_length: int = 256,
654
+ sampling_type: FramepackSamplingType = FramepackSamplingType.INVERTED_ANTI_DRIFTING,
655
+ ):
656
+ r"""
657
+ The call function to the pipeline for generation.
658
+
659
+ Args:
660
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
661
+ The image to be used as the starting point for the video generation.
662
+ last_image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`, *optional*):
663
+ The optional last image to be used as the ending point for the video generation. This is useful for
664
+ generating transitions between two images.
665
+ prompt (`str` or `List[str]`, *optional*):
666
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
667
+ instead.
668
+ prompt_2 (`str` or `List[str]`, *optional*):
669
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
670
+ will be used instead.
671
+ negative_prompt (`str` or `List[str]`, *optional*):
672
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
673
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
674
+ not greater than `1`).
675
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
676
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
677
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
678
+ height (`int`, defaults to `720`):
679
+ The height in pixels of the generated image.
680
+ width (`int`, defaults to `1280`):
681
+ The width in pixels of the generated image.
682
+ num_frames (`int`, defaults to `129`):
683
+ The number of frames in the generated video.
684
+ num_inference_steps (`int`, defaults to `50`):
685
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
686
+ expense of slower inference.
687
+ sigmas (`List[float]`, *optional*):
688
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
689
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
690
+ will be used.
691
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
692
+ True classifier-free guidance is enabled when `true_cfg_scale` > 1 and a `negative_prompt` is provided.
693
+ guidance_scale (`float`, defaults to `6.0`):
694
+ Guidance scale as defined in [Classifier-Free Diffusion
695
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
696
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
697
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
698
+ the text `prompt`, usually at the expense of lower image quality. Note that the only available
699
+ HunyuanVideo model is CFG-distilled, which means that traditional guidance between unconditional and
700
+ conditional latent is not applied.
701
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
702
+ The number of videos to generate per prompt.
703
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
704
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
705
+ generation deterministic.
706
+ image_latents (`torch.Tensor`, *optional*):
707
+ Pre-encoded image latents. If not provided, the image will be encoded using the VAE.
708
+ last_image_latents (`torch.Tensor`, *optional*):
709
+ Pre-encoded last image latents. If not provided, the last image will be encoded using the VAE.
710
+ prompt_embeds (`torch.Tensor`, *optional*):
711
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
712
+ provided, text embeddings are generated from the `prompt` input argument.
713
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
714
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
715
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
716
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
717
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
718
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
719
+ argument.
720
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
721
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
722
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
723
+ input argument.
724
+ output_type (`str`, *optional*, defaults to `"pil"`):
725
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
726
+ return_dict (`bool`, *optional*, defaults to `True`):
727
+ Whether or not to return a [`HunyuanVideoFramepackPipelineOutput`] instead of a plain tuple.
728
+ attention_kwargs (`dict`, *optional*):
729
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
730
+ `self.processor` in
731
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
732
+ clip_skip (`int`, *optional*):
733
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
734
+ the output of the pre-final layer will be used for computing the prompt embeddings.
735
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
736
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
737
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
738
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
739
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
740
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
741
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
742
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
743
+ `._callback_tensor_inputs` attribute of your pipeline class.
744
+
745
+ Examples:
746
+
747
+ Returns:
748
+ [`~HunyuanVideoFramepackPipelineOutput`] or `tuple`:
749
+ If `return_dict` is `True`, [`HunyuanVideoFramepackPipelineOutput`] is returned, otherwise a `tuple` is
750
+ returned where the first element is a list with the generated frames.
753
+ """
754
+
755
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
756
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
757
+
758
+ # 1. Check inputs. Raise error if not correct
759
+ self.check_inputs(
760
+ prompt,
761
+ prompt_2,
762
+ height,
763
+ width,
764
+ prompt_embeds,
765
+ callback_on_step_end_tensor_inputs,
766
+ prompt_template,
767
+ image,
768
+ image_latents,
769
+ last_image,
770
+ last_image_latents,
771
+ sampling_type,
772
+ )
773
+
774
+ has_neg_prompt = negative_prompt is not None or (
775
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
776
+ )
777
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
778
+
779
+ self._guidance_scale = guidance_scale
780
+ self._attention_kwargs = attention_kwargs
781
+ self._current_timestep = None
782
+ self._interrupt = False
783
+
784
+ device = self._execution_device
785
+ transformer_dtype = self.transformer.dtype
786
+ vae_dtype = self.vae.dtype
787
+
788
+ # 2. Define call parameters
789
+ if prompt is not None and isinstance(prompt, str):
790
+ batch_size = 1
791
+ elif prompt is not None and isinstance(prompt, list):
792
+ batch_size = len(prompt)
793
+ else:
794
+ batch_size = prompt_embeds.shape[0]
795
+
796
+ # 3. Encode input prompt
797
+ transformer_dtype = self.transformer.dtype
798
+ prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
799
+ prompt=prompt,
800
+ prompt_2=prompt_2,
801
+ prompt_template=prompt_template,
802
+ num_videos_per_prompt=num_videos_per_prompt,
803
+ prompt_embeds=prompt_embeds,
804
+ pooled_prompt_embeds=pooled_prompt_embeds,
805
+ prompt_attention_mask=prompt_attention_mask,
806
+ device=device,
807
+ max_sequence_length=max_sequence_length,
808
+ )
809
+ prompt_embeds = prompt_embeds.to(transformer_dtype)
810
+ prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
811
+ pooled_prompt_embeds = pooled_prompt_embeds.to(transformer_dtype)
812
+
813
+ if do_true_cfg:
814
+ negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask = self.encode_prompt(
815
+ prompt=negative_prompt,
816
+ prompt_2=negative_prompt_2,
817
+ prompt_template=prompt_template,
818
+ num_videos_per_prompt=num_videos_per_prompt,
819
+ prompt_embeds=negative_prompt_embeds,
820
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
821
+ prompt_attention_mask=negative_prompt_attention_mask,
822
+ device=device,
823
+ max_sequence_length=max_sequence_length,
824
+ )
825
+ negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
826
+ negative_prompt_attention_mask = negative_prompt_attention_mask.to(transformer_dtype)
827
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.to(transformer_dtype)
828
+
829
+ # 4. Prepare image
830
+ image = self.video_processor.preprocess(image, height, width)
831
+ image_embeds = self.encode_image(image, device=device).to(transformer_dtype)
832
+ if last_image is not None:
833
+ # Credits: https://github.com/lllyasviel/FramePack/pull/167
834
+ # Users can modify the weighting strategy applied here
835
+ last_image = self.video_processor.preprocess(last_image, height, width)
836
+ last_image_embeds = self.encode_image(last_image, device=device).to(transformer_dtype)
837
+ last_image_embeds = (image_embeds + last_image_embeds) / 2
838
+
839
+ # 5. Prepare latent variables
840
+ num_channels_latents = self.transformer.config.in_channels
841
+ window_num_frames = (latent_window_size - 1) * self.vae_scale_factor_temporal + 1
842
+ num_latent_sections = max(1, (num_frames + window_num_frames - 1) // window_num_frames)
843
+ history_video = None
844
+ total_generated_latent_frames = 0
845
+
846
+ image_latents = self.prepare_image_latents(
847
+ image, dtype=torch.float32, device=device, generator=generator, latents=image_latents
848
+ )
849
+ if last_image is not None:
850
+ last_image_latents = self.prepare_image_latents(
851
+ last_image, dtype=torch.float32, device=device, generator=generator
852
+ )
853
+
854
+ # Specific to the released checkpoints:
855
+ # - https://huggingface.co/lllyasviel/FramePackI2V_HY
856
+ # - https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503
857
+ # TODO: find a more generic way in future if there are more checkpoints
858
+ if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
859
+ history_sizes = [1, 2, 16]
860
+ history_latents = torch.zeros(
861
+ batch_size,
862
+ num_channels_latents,
863
+ sum(history_sizes),
864
+ height // self.vae_scale_factor_spatial,
865
+ width // self.vae_scale_factor_spatial,
866
+ device=device,
867
+ dtype=torch.float32,
868
+ )
869
+
870
+ elif sampling_type == FramepackSamplingType.VANILLA:
871
+ history_sizes = [16, 2, 1]
872
+ history_latents = torch.zeros(
873
+ batch_size,
874
+ num_channels_latents,
875
+ sum(history_sizes),
876
+ height // self.vae_scale_factor_spatial,
877
+ width // self.vae_scale_factor_spatial,
878
+ device=device,
879
+ dtype=torch.float32,
880
+ )
881
+ history_latents = torch.cat([history_latents, image_latents], dim=2)
882
+ total_generated_latent_frames += 1
883
+
884
+ else:
885
+ assert False
886
+
887
+ # 6. Prepare guidance condition
888
+ guidance = torch.tensor([guidance_scale] * batch_size, dtype=transformer_dtype, device=device) * 1000.0
889
+
890
+ # 7. Denoising loop
891
+ for k in range(num_latent_sections):
892
+ if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
893
+ latent_paddings = list(reversed(range(num_latent_sections)))
894
+ if num_latent_sections > 4:
895
+ latent_paddings = [3] + [2] * (num_latent_sections - 3) + [1, 0]
896
+
897
+ is_first_section = k == 0
898
+ is_last_section = k == num_latent_sections - 1
899
+ latent_padding_size = latent_paddings[k] * latent_window_size
900
+
901
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, *history_sizes]))
902
+ (
903
+ indices_prefix,
904
+ indices_padding,
905
+ indices_latents,
906
+ indices_latents_history_1x,
907
+ indices_latents_history_2x,
908
+ indices_latents_history_4x,
909
+ ) = indices.split([1, latent_padding_size, latent_window_size, *history_sizes], dim=0)
910
+ # Inverted anti-drifting sampling: Figure 2(c) in the paper
911
+ indices_clean_latents = torch.cat([indices_prefix, indices_latents_history_1x], dim=0)
912
+
913
+ latents_prefix = image_latents
914
+ latents_history_1x, latents_history_2x, latents_history_4x = history_latents[
915
+ :, :, : sum(history_sizes)
916
+ ].split(history_sizes, dim=2)
917
+ if last_image is not None and is_first_section:
918
+ latents_history_1x = last_image_latents
919
+ latents_clean = torch.cat([latents_prefix, latents_history_1x], dim=2)
920
+
921
+ elif sampling_type == FramepackSamplingType.VANILLA:
922
+ indices = torch.arange(0, sum([1, *history_sizes, latent_window_size]))
923
+ (
924
+ indices_prefix,
925
+ indices_latents_history_4x,
926
+ indices_latents_history_2x,
927
+ indices_latents_history_1x,
928
+ indices_latents,
929
+ ) = indices.split([1, *history_sizes, latent_window_size], dim=0)
930
+ indices_clean_latents = torch.cat([indices_prefix, indices_latents_history_1x], dim=0)
931
+
932
+ latents_prefix = image_latents
933
+ latents_history_4x, latents_history_2x, latents_history_1x = history_latents[
934
+ :, :, -sum(history_sizes) :
935
+ ].split(history_sizes, dim=2)
936
+ latents_clean = torch.cat([latents_prefix, latents_history_1x], dim=2)
937
+
938
+ else:
939
+ assert False
940
+
941
+ latents = self.prepare_latents(
942
+ batch_size,
943
+ num_channels_latents,
944
+ height,
945
+ width,
946
+ window_num_frames,
947
+ dtype=torch.float32,
948
+ device=device,
949
+ generator=generator,
950
+ latents=None,
951
+ )
952
+
953
+ sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
954
+ image_seq_len = (
955
+ latents.shape[2] * latents.shape[3] * latents.shape[4] / self.transformer.config.patch_size**2
956
+ )
957
+ exp_max = 7.0
958
+ mu = calculate_shift(
959
+ image_seq_len,
960
+ self.scheduler.config.get("base_image_seq_len", 256),
961
+ self.scheduler.config.get("max_image_seq_len", 4096),
962
+ self.scheduler.config.get("base_shift", 0.5),
963
+ self.scheduler.config.get("max_shift", 1.15),
964
+ )
965
+ mu = min(mu, math.log(exp_max))
966
+ timesteps, num_inference_steps = retrieve_timesteps(
967
+ self.scheduler, num_inference_steps, device, sigmas=sigmas, mu=mu
968
+ )
969
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
970
+ self._num_timesteps = len(timesteps)
971
+
972
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
973
+ for i, t in enumerate(timesteps):
974
+ if self.interrupt:
975
+ continue
976
+
977
+ self._current_timestep = t
978
+ timestep = t.expand(latents.shape[0])
979
+
980
+ noise_pred = self.transformer(
981
+ hidden_states=latents.to(transformer_dtype),
982
+ timestep=timestep,
983
+ encoder_hidden_states=prompt_embeds,
984
+ encoder_attention_mask=prompt_attention_mask,
985
+ pooled_projections=pooled_prompt_embeds,
986
+ image_embeds=image_embeds,
987
+ indices_latents=indices_latents,
988
+ guidance=guidance,
989
+ latents_clean=latents_clean.to(transformer_dtype),
990
+ indices_latents_clean=indices_clean_latents,
991
+ latents_history_2x=latents_history_2x.to(transformer_dtype),
992
+ indices_latents_history_2x=indices_latents_history_2x,
993
+ latents_history_4x=latents_history_4x.to(transformer_dtype),
994
+ indices_latents_history_4x=indices_latents_history_4x,
995
+ attention_kwargs=attention_kwargs,
996
+ return_dict=False,
997
+ )[0]
998
+
999
+ if do_true_cfg:
1000
+ neg_noise_pred = self.transformer(
1001
+ hidden_states=latents.to(transformer_dtype),
1002
+ timestep=timestep,
1003
+ encoder_hidden_states=negative_prompt_embeds,
1004
+ encoder_attention_mask=negative_prompt_attention_mask,
1005
+ pooled_projections=negative_pooled_prompt_embeds,
1006
+ image_embeds=image_embeds,
1007
+ indices_latents=indices_latents,
1008
+ guidance=guidance,
1009
+ latents_clean=latents_clean.to(transformer_dtype),
1010
+ indices_latents_clean=indices_clean_latents,
1011
+ latents_history_2x=latents_history_2x.to(transformer_dtype),
1012
+ indices_latents_history_2x=indices_latents_history_2x,
1013
+ latents_history_4x=latents_history_4x.to(transformer_dtype),
1014
+ indices_latents_history_4x=indices_latents_history_4x,
1015
+ attention_kwargs=attention_kwargs,
1016
+ return_dict=False,
1017
+ )[0]
1018
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
1019
+
1020
+ # compute the previous noisy sample x_t -> x_t-1
1021
+ latents = self.scheduler.step(noise_pred.float(), t, latents, return_dict=False)[0]
1022
+
1023
+ if callback_on_step_end is not None:
1024
+ callback_kwargs = {}
1025
+ for k in callback_on_step_end_tensor_inputs:
1026
+ callback_kwargs[k] = locals()[k]
1027
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1028
+
1029
+ latents = callback_outputs.pop("latents", latents)
1030
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1031
+
1032
+ # call the callback, if provided
1033
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1034
+ progress_bar.update()
1035
+
1036
+ if XLA_AVAILABLE:
1037
+ xm.mark_step()
1038
+
1039
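+ # INVERTED_ANTI_DRIFTING prepends the newly generated section to the history, VANILLA appends it; index_slice selects the frames to decode for this section.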
+ if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
1040
+ if is_last_section:
1041
+ latents = torch.cat([image_latents, latents], dim=2)
1042
+ total_generated_latent_frames += latents.shape[2]
1043
+ history_latents = torch.cat([latents, history_latents], dim=2)
1044
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames]
1045
+ section_latent_frames = (
1046
+ (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
1047
+ )
1048
+ index_slice = (slice(None), slice(None), slice(0, section_latent_frames))
1049
+
1050
+ elif sampling_type == FramepackSamplingType.VANILLA:
1051
+ total_generated_latent_frames += latents.shape[2]
1052
+ history_latents = torch.cat([history_latents, latents], dim=2)
1053
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:]
1054
+ section_latent_frames = latent_window_size * 2
1055
+ index_slice = (slice(None), slice(None), slice(-section_latent_frames, None))
1056
+
1057
+ else:
1058
+ assert False
1059
+
1060
+ if history_video is None:
1061
+ if not output_type == "latent":
1062
+ current_latents = real_history_latents.to(vae_dtype) / self.vae.config.scaling_factor
1063
+ history_video = self.vae.decode(current_latents, return_dict=False)[0]
1064
+ else:
1065
+ history_video = [real_history_latents]
1066
+ else:
1067
+ if not output_type == "latent":
1068
+ overlapped_frames = (latent_window_size - 1) * self.vae_scale_factor_temporal + 1
1069
+ current_latents = (
1070
+ real_history_latents[index_slice].to(vae_dtype) / self.vae.config.scaling_factor
1071
+ )
1072
+ current_video = self.vae.decode(current_latents, return_dict=False)[0]
1073
+
1074
+ if sampling_type == FramepackSamplingType.INVERTED_ANTI_DRIFTING:
1075
+ history_video = self._soft_append(current_video, history_video, overlapped_frames)
1076
+ elif sampling_type == FramepackSamplingType.VANILLA:
1077
+ history_video = self._soft_append(history_video, current_video, overlapped_frames)
1078
+ else:
1079
+ assert False
1080
+ else:
1081
+ history_video.append(real_history_latents)
1082
+
1083
+ self._current_timestep = None
1084
+
1085
+ if not output_type == "latent":
1086
+ generated_frames = history_video.size(2)
1087
+ generated_frames = (
1088
+ generated_frames - 1
1089
+ ) // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
1090
+ history_video = history_video[:, :, :generated_frames]
1091
+ video = self.video_processor.postprocess_video(history_video, output_type=output_type)
1092
+ else:
1093
+ video = history_video
1094
+
1095
+ # Offload all models
1096
+ self.maybe_free_model_hooks()
1097
+
1098
+ if not return_dict:
1099
+ return (video,)
1100
+
1101
+ return HunyuanVideoFramepackPipelineOutput(frames=video)
1102
+
1103
+ def _soft_append(self, history: torch.Tensor, current: torch.Tensor, overlap: int = 0):
1104
+ if overlap <= 0:
1105
+ return torch.cat([history, current], dim=2)
1106
+
1107
+ assert history.shape[2] >= overlap, f"History length ({history.shape[2]}) must be >= overlap ({overlap})"
1108
+ assert current.shape[2] >= overlap, f"Current length ({current.shape[2]}) must be >= overlap ({overlap})"
1109
+
1110
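+ # Cross-fade the overlap: the existing history is weighted 1 -> 0 across the overlapping frames while the new section is weighted 0 -> 1.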
+ weights = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
1111
+ blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
1112
+ output = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)
1113
+
1114
+ return output.to(history)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py ADDED
@@ -0,0 +1,980 @@
1
+ # Copyright 2025 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from transformers import (
22
+ CLIPImageProcessor,
23
+ CLIPTextModel,
24
+ CLIPTokenizer,
25
+ LlamaTokenizerFast,
26
+ LlavaForConditionalGeneration,
27
+ )
28
+
29
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
30
+ from ...loaders import HunyuanVideoLoraLoaderMixin
31
+ from ...models import AutoencoderKLHunyuanVideo, HunyuanVideoTransformer3DModel
32
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
33
+ from ...utils import is_torch_xla_available, logging, replace_example_docstring
34
+ from ...utils.torch_utils import randn_tensor
35
+ from ...video_processor import VideoProcessor
36
+ from ..pipeline_utils import DiffusionPipeline
37
+ from .pipeline_output import HunyuanVideoPipelineOutput
38
+
39
+
40
+ if is_torch_xla_available():
41
+ import torch_xla.core.xla_model as xm
42
+
43
+ XLA_AVAILABLE = True
44
+ else:
45
+ XLA_AVAILABLE = False
46
+
47
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
48
+
49
+
50
+ EXAMPLE_DOC_STRING = """
51
+ Examples:
52
+ ```python
53
+ >>> import torch
54
+ >>> from diffusers import HunyuanVideoImageToVideoPipeline, HunyuanVideoTransformer3DModel
55
+ >>> from diffusers.utils import load_image, export_to_video
56
+
57
+ >>> # Available checkpoints: hunyuanvideo-community/HunyuanVideo-I2V, hunyuanvideo-community/HunyuanVideo-I2V-33ch
58
+ >>> model_id = "hunyuanvideo-community/HunyuanVideo-I2V"
59
+ >>> transformer = HunyuanVideoTransformer3DModel.from_pretrained(
60
+ ... model_id, subfolder="transformer", torch_dtype=torch.bfloat16
61
+ ... )
62
+ >>> pipe = HunyuanVideoImageToVideoPipeline.from_pretrained(
63
+ ... model_id, transformer=transformer, torch_dtype=torch.float16
64
+ ... )
65
+ >>> pipe.vae.enable_tiling()
66
+ >>> pipe.to("cuda")
67
+
68
+ >>> prompt = "A man with short gray hair plays a red electric guitar."
69
+ >>> image = load_image(
70
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
71
+ ... )
72
+
73
+ >>> # If using hunyuanvideo-community/HunyuanVideo-I2V
74
+ >>> output = pipe(image=image, prompt=prompt, guidance_scale=6.0).frames[0]
75
+
76
+ >>> # If using hunyuanvideo-community/HunyuanVideo-I2V-33ch
77
+ >>> output = pipe(image=image, prompt=prompt, guidance_scale=1.0, true_cfg_scale=1.0).frames[0]
78
+
79
+ >>> export_to_video(output, "output.mp4", fps=15)
80
+ ```
81
+ """
82
+
83
+
84
+ DEFAULT_PROMPT_TEMPLATE = {
85
+ "template": (
86
+ "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
87
+ "1. The main content and theme of the video."
88
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
89
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
90
+ "4. background environment, light, style and atmosphere."
91
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
92
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
93
+ "<|start_header_id|>assistant<|end_header_id|>\n\n"
94
+ ),
95
+ "crop_start": 103,
96
+ "image_emb_start": 5,
97
+ "image_emb_end": 581,
98
+ "image_emb_len": 576,
99
+ "double_return_token_id": 271,
100
+ }
101
+
102
+
103
+ def _expand_input_ids_with_image_tokens(
104
+ text_input_ids,
105
+ prompt_attention_mask,
106
+ max_sequence_length,
107
+ image_token_index,
108
+ image_emb_len,
109
+ image_emb_start,
110
+ image_emb_end,
111
+ pad_token_id,
112
+ ):
113
+ special_image_token_mask = text_input_ids == image_token_index
114
+ num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
115
+ batch_indices, non_image_indices = torch.where(text_input_ids != image_token_index)
116
+
117
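+ # Expand each <image> placeholder into image_emb_len token positions so the image features produced by the text encoder can occupy the sequence.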
+ max_expanded_length = max_sequence_length + (num_special_image_tokens.max() * (image_emb_len - 1))
118
+ new_token_positions = torch.cumsum((special_image_token_mask * (image_emb_len - 1) + 1), -1) - 1
119
+ text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
120
+
121
+ expanded_input_ids = torch.full(
122
+ (text_input_ids.shape[0], max_expanded_length),
123
+ pad_token_id,
124
+ dtype=text_input_ids.dtype,
125
+ device=text_input_ids.device,
126
+ )
127
+ expanded_input_ids[batch_indices, text_to_overwrite] = text_input_ids[batch_indices, non_image_indices]
128
+ expanded_input_ids[batch_indices, image_emb_start:image_emb_end] = image_token_index
129
+
130
+ expanded_attention_mask = torch.zeros(
131
+ (text_input_ids.shape[0], max_expanded_length),
132
+ dtype=prompt_attention_mask.dtype,
133
+ device=prompt_attention_mask.device,
134
+ )
135
+ attn_batch_indices, attention_indices = torch.where(expanded_input_ids != pad_token_id)
136
+ expanded_attention_mask[attn_batch_indices, attention_indices] = 1.0
137
+ expanded_attention_mask = expanded_attention_mask.to(prompt_attention_mask.dtype)
138
+ position_ids = (expanded_attention_mask.cumsum(-1) - 1).masked_fill_((expanded_attention_mask == 0), 1)
139
+
140
+ return {
141
+ "input_ids": expanded_input_ids,
142
+ "attention_mask": expanded_attention_mask,
143
+ "position_ids": position_ids,
144
+ }
145
+
146
+
147
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
148
+ def retrieve_timesteps(
149
+ scheduler,
150
+ num_inference_steps: Optional[int] = None,
151
+ device: Optional[Union[str, torch.device]] = None,
152
+ timesteps: Optional[List[int]] = None,
153
+ sigmas: Optional[List[float]] = None,
154
+ **kwargs,
155
+ ):
156
+ r"""
157
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
158
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
159
+
160
+ Args:
161
+ scheduler (`SchedulerMixin`):
162
+ The scheduler to get timesteps from.
163
+ num_inference_steps (`int`):
164
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
165
+ must be `None`.
166
+ device (`str` or `torch.device`, *optional*):
167
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
168
+ timesteps (`List[int]`, *optional*):
169
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
170
+ `num_inference_steps` and `sigmas` must be `None`.
171
+ sigmas (`List[float]`, *optional*):
172
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
173
+ `num_inference_steps` and `timesteps` must be `None`.
174
+
175
+ Returns:
176
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
177
+ second element is the number of inference steps.
178
+ """
179
+ if timesteps is not None and sigmas is not None:
180
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
181
+ if timesteps is not None:
182
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
183
+ if not accepts_timesteps:
184
+ raise ValueError(
185
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
186
+ f" timestep schedules. Please check whether you are using the correct scheduler."
187
+ )
188
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
189
+ timesteps = scheduler.timesteps
190
+ num_inference_steps = len(timesteps)
191
+ elif sigmas is not None:
192
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
193
+ if not accept_sigmas:
194
+ raise ValueError(
195
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
196
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
197
+ )
198
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
199
+ timesteps = scheduler.timesteps
200
+ num_inference_steps = len(timesteps)
201
+ else:
202
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
203
+ timesteps = scheduler.timesteps
204
+ return timesteps, num_inference_steps
205
+
206
+
207
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
208
+ def retrieve_latents(
209
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
210
+ ):
211
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
212
+ return encoder_output.latent_dist.sample(generator)
213
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
214
+ return encoder_output.latent_dist.mode()
215
+ elif hasattr(encoder_output, "latents"):
216
+ return encoder_output.latents
217
+ else:
218
+ raise AttributeError("Could not access latents of provided encoder_output")
219
+
220
+
221
+ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
222
+ r"""
223
+ Pipeline for image-to-video generation using HunyuanVideo.
224
+
225
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
226
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
227
+
228
+ Args:
229
+ text_encoder ([`LlavaForConditionalGeneration`]):
230
+ [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
231
+ tokenizer (`LlamaTokenizer`):
232
+ Tokenizer from [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
233
+ transformer ([`HunyuanVideoTransformer3DModel`]):
234
+ Conditional Transformer to denoise the encoded image latents.
235
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
236
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
237
+ vae ([`AutoencoderKLHunyuanVideo`]):
238
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
239
+ text_encoder_2 ([`CLIPTextModel`]):
240
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
241
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
242
+ tokenizer_2 (`CLIPTokenizer`):
243
+ Tokenizer of class
244
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
245
+ """
246
+
247
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
248
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
249
+
250
+ def __init__(
251
+ self,
252
+ text_encoder: LlavaForConditionalGeneration,
253
+ tokenizer: LlamaTokenizerFast,
254
+ transformer: HunyuanVideoTransformer3DModel,
255
+ vae: AutoencoderKLHunyuanVideo,
256
+ scheduler: FlowMatchEulerDiscreteScheduler,
257
+ text_encoder_2: CLIPTextModel,
258
+ tokenizer_2: CLIPTokenizer,
259
+ image_processor: CLIPImageProcessor,
260
+ ):
261
+ super().__init__()
262
+
263
+ self.register_modules(
264
+ vae=vae,
265
+ text_encoder=text_encoder,
266
+ tokenizer=tokenizer,
267
+ transformer=transformer,
268
+ scheduler=scheduler,
269
+ text_encoder_2=text_encoder_2,
270
+ tokenizer_2=tokenizer_2,
271
+ image_processor=image_processor,
272
+ )
273
+
274
+ self.vae_scaling_factor = self.vae.config.scaling_factor if getattr(self, "vae", None) else 0.476986
275
+ self.vae_scale_factor_temporal = self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4
276
+ self.vae_scale_factor_spatial = self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 8
277
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
278
+
279
+ def _get_llama_prompt_embeds(
280
+ self,
281
+ image: torch.Tensor,
282
+ prompt: Union[str, List[str]],
283
+ prompt_template: Dict[str, Any],
284
+ num_videos_per_prompt: int = 1,
285
+ device: Optional[torch.device] = None,
286
+ dtype: Optional[torch.dtype] = None,
287
+ max_sequence_length: int = 256,
288
+ num_hidden_layers_to_skip: int = 2,
289
+ image_embed_interleave: int = 2,
290
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
291
+ device = device or self._execution_device
292
+ dtype = dtype or self.text_encoder.dtype
293
+
294
+ prompt = [prompt] if isinstance(prompt, str) else prompt
295
+ prompt = [prompt_template["template"].format(p) for p in prompt]
296
+
297
+ crop_start = prompt_template.get("crop_start", None)
298
+
299
+ image_emb_len = prompt_template.get("image_emb_len", 576)
300
+ image_emb_start = prompt_template.get("image_emb_start", 5)
301
+ image_emb_end = prompt_template.get("image_emb_end", 581)
302
+ double_return_token_id = prompt_template.get("double_return_token_id", 271)
303
+
304
+ if crop_start is None:
305
+ prompt_template_input = self.tokenizer(
306
+ prompt_template["template"],
307
+ padding="max_length",
308
+ return_tensors="pt",
309
+ return_length=False,
310
+ return_overflowing_tokens=False,
311
+ return_attention_mask=False,
312
+ )
313
+ crop_start = prompt_template_input["input_ids"].shape[-1]
314
+ # Remove <|start_header_id|>, <|end_header_id|>, assistant, <|eot_id|>, and placeholder {}
315
+ crop_start -= 5
316
+
317
+ max_sequence_length += crop_start
318
+ text_inputs = self.tokenizer(
319
+ prompt,
320
+ max_length=max_sequence_length,
321
+ padding="max_length",
322
+ truncation=True,
323
+ return_tensors="pt",
324
+ return_length=False,
325
+ return_overflowing_tokens=False,
326
+ return_attention_mask=True,
327
+ )
328
+ text_input_ids = text_inputs.input_ids.to(device=device)
329
+ prompt_attention_mask = text_inputs.attention_mask.to(device=device)
330
+
331
+ image_embeds = self.image_processor(image, return_tensors="pt").pixel_values.to(device)
332
+
333
+ image_token_index = self.text_encoder.config.image_token_index
334
+ pad_token_id = self.text_encoder.config.pad_token_id
335
+ expanded_inputs = _expand_input_ids_with_image_tokens(
336
+ text_input_ids,
337
+ prompt_attention_mask,
338
+ max_sequence_length,
339
+ image_token_index,
340
+ image_emb_len,
341
+ image_emb_start,
342
+ image_emb_end,
343
+ pad_token_id,
344
+ )
345
+ prompt_embeds = self.text_encoder(
346
+ **expanded_inputs,
347
+ pixel_values=image_embeds,
348
+ output_hidden_states=True,
349
+ ).hidden_states[-(num_hidden_layers_to_skip + 1)]
350
+ prompt_embeds = prompt_embeds.to(dtype=dtype)
351
+
352
+ if crop_start is not None and crop_start > 0:
353
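+ # Crop out the system-template and assistant-header tokens from the hidden states, keeping the user prompt and the image-embedding span separately.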
+ text_crop_start = crop_start - 1 + image_emb_len
354
+ batch_indices, last_double_return_token_indices = torch.where(text_input_ids == double_return_token_id)
355
+
356
+ if last_double_return_token_indices.shape[0] == 3:
357
+ # in case the prompt is too long
358
+ last_double_return_token_indices = torch.cat(
359
+ (last_double_return_token_indices, torch.tensor([text_input_ids.shape[-1]]))
360
+ )
361
+ batch_indices = torch.cat((batch_indices, torch.tensor([0])))
362
+
363
+ last_double_return_token_indices = last_double_return_token_indices.reshape(text_input_ids.shape[0], -1)[
364
+ :, -1
365
+ ]
366
+ batch_indices = batch_indices.reshape(text_input_ids.shape[0], -1)[:, -1]
367
+ assistant_crop_start = last_double_return_token_indices - 1 + image_emb_len - 4
368
+ assistant_crop_end = last_double_return_token_indices - 1 + image_emb_len
369
+ attention_mask_assistant_crop_start = last_double_return_token_indices - 4
370
+ attention_mask_assistant_crop_end = last_double_return_token_indices
371
+
372
+ prompt_embed_list = []
373
+ prompt_attention_mask_list = []
374
+ image_embed_list = []
375
+ image_attention_mask_list = []
376
+
377
+ for i in range(text_input_ids.shape[0]):
378
+ prompt_embed_list.append(
379
+ torch.cat(
380
+ [
381
+ prompt_embeds[i, text_crop_start : assistant_crop_start[i].item()],
382
+ prompt_embeds[i, assistant_crop_end[i].item() :],
383
+ ]
384
+ )
385
+ )
386
+ prompt_attention_mask_list.append(
387
+ torch.cat(
388
+ [
389
+ prompt_attention_mask[i, crop_start : attention_mask_assistant_crop_start[i].item()],
390
+ prompt_attention_mask[i, attention_mask_assistant_crop_end[i].item() :],
391
+ ]
392
+ )
393
+ )
394
+ image_embed_list.append(prompt_embeds[i, image_emb_start:image_emb_end])
395
+ image_attention_mask_list.append(
396
+ torch.ones(image_embed_list[-1].shape[0]).to(prompt_embeds.device).to(prompt_attention_mask.dtype)
397
+ )
398
+
399
+ prompt_embed_list = torch.stack(prompt_embed_list)
400
+ prompt_attention_mask_list = torch.stack(prompt_attention_mask_list)
401
+ image_embed_list = torch.stack(image_embed_list)
402
+ image_attention_mask_list = torch.stack(image_attention_mask_list)
403
+
404
+ if 0 < image_embed_interleave < 6:
405
+ image_embed_list = image_embed_list[:, ::image_embed_interleave, :]
406
+ image_attention_mask_list = image_attention_mask_list[:, ::image_embed_interleave]
407
+
408
+ assert (
409
+ prompt_embed_list.shape[0] == prompt_attention_mask_list.shape[0]
410
+ and image_embed_list.shape[0] == image_attention_mask_list.shape[0]
411
+ )
412
+
413
+ prompt_embeds = torch.cat([image_embed_list, prompt_embed_list], dim=1)
414
+ prompt_attention_mask = torch.cat([image_attention_mask_list, prompt_attention_mask_list], dim=1)
415
+
416
+ return prompt_embeds, prompt_attention_mask
417
+
418
+ def _get_clip_prompt_embeds(
419
+ self,
420
+ prompt: Union[str, List[str]],
421
+ num_videos_per_prompt: int = 1,
422
+ device: Optional[torch.device] = None,
423
+ dtype: Optional[torch.dtype] = None,
424
+ max_sequence_length: int = 77,
425
+ ) -> torch.Tensor:
426
+ device = device or self._execution_device
427
+ dtype = dtype or self.text_encoder_2.dtype
428
+
429
+ prompt = [prompt] if isinstance(prompt, str) else prompt
430
+
431
+ text_inputs = self.tokenizer_2(
432
+ prompt,
433
+ padding="max_length",
434
+ max_length=max_sequence_length,
435
+ truncation=True,
436
+ return_tensors="pt",
437
+ )
438
+
439
+ text_input_ids = text_inputs.input_ids
440
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
441
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
442
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
443
+ logger.warning(
444
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
445
+ f" {max_sequence_length} tokens: {removed_text}"
446
+ )
447
+
448
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False).pooler_output
449
+ return prompt_embeds
450
+
451
+ def encode_prompt(
452
+ self,
453
+ image: torch.Tensor,
454
+ prompt: Union[str, List[str]],
455
+ prompt_2: Union[str, List[str]] = None,
456
+ prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
457
+ num_videos_per_prompt: int = 1,
458
+ prompt_embeds: Optional[torch.Tensor] = None,
459
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
460
+ prompt_attention_mask: Optional[torch.Tensor] = None,
461
+ device: Optional[torch.device] = None,
462
+ dtype: Optional[torch.dtype] = None,
463
+ max_sequence_length: int = 256,
464
+ image_embed_interleave: int = 2,
465
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
466
+ if prompt_embeds is None:
467
+ prompt_embeds, prompt_attention_mask = self._get_llama_prompt_embeds(
468
+ image,
469
+ prompt,
470
+ prompt_template,
471
+ num_videos_per_prompt,
472
+ device=device,
473
+ dtype=dtype,
474
+ max_sequence_length=max_sequence_length,
475
+ image_embed_interleave=image_embed_interleave,
476
+ )
477
+
478
+ if pooled_prompt_embeds is None:
479
+ if prompt_2 is None:
480
+ prompt_2 = prompt
481
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
482
+ prompt_2,
483
+ num_videos_per_prompt,
484
+ device=device,
485
+ dtype=dtype,
486
+ max_sequence_length=77,
487
+ )
488
+
489
+ return prompt_embeds, pooled_prompt_embeds, prompt_attention_mask
490
+
491
+ def check_inputs(
492
+ self,
493
+ prompt,
494
+ prompt_2,
495
+ height,
496
+ width,
497
+ prompt_embeds=None,
498
+ callback_on_step_end_tensor_inputs=None,
499
+ prompt_template=None,
500
+ true_cfg_scale=1.0,
501
+ guidance_scale=1.0,
502
+ ):
503
+ if height % 16 != 0 or width % 16 != 0:
504
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
505
+
506
+ if callback_on_step_end_tensor_inputs is not None and not all(
507
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
508
+ ):
509
+ raise ValueError(
510
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
511
+ )
512
+
513
+ if prompt is not None and prompt_embeds is not None:
514
+ raise ValueError(
515
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
516
+ " only forward one of the two."
517
+ )
518
+ elif prompt_2 is not None and prompt_embeds is not None:
519
+ raise ValueError(
520
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
521
+ " only forward one of the two."
522
+ )
523
+ elif prompt is None and prompt_embeds is None:
524
+ raise ValueError(
525
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
526
+ )
527
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
528
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
529
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
530
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
531
+
532
+ if prompt_template is not None:
533
+ if not isinstance(prompt_template, dict):
534
+ raise ValueError(f"`prompt_template` has to be of type `dict` but is {type(prompt_template)}")
535
+ if "template" not in prompt_template:
536
+ raise ValueError(
537
+ f"`prompt_template` has to contain a key `template` but only found {prompt_template.keys()}"
538
+ )
539
+
540
+ if true_cfg_scale > 1.0 and guidance_scale > 1.0:
541
+ logger.warning(
542
+ "Both `true_cfg_scale` and `guidance_scale` are greater than 1.0. This will result in both "
543
+ "classifier-free guidance and embedded-guidance to be applied. This is not recommended "
544
+ "as it may lead to higher memory usage, slower inference and potentially worse results."
545
+ )
546
+
547
+ def prepare_latents(
548
+ self,
549
+ image: torch.Tensor,
550
+ batch_size: int,
551
+ num_channels_latents: int = 32,
552
+ height: int = 720,
553
+ width: int = 1280,
554
+ num_frames: int = 129,
555
+ dtype: Optional[torch.dtype] = None,
556
+ device: Optional[torch.device] = None,
557
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
558
+ latents: Optional[torch.Tensor] = None,
559
+ image_condition_type: str = "latent_concat",
560
+ ) -> torch.Tensor:
561
+ if isinstance(generator, list) and len(generator) != batch_size:
562
+ raise ValueError(
563
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
564
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
565
+ )
566
+
567
+ num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
568
+ latent_height, latent_width = height // self.vae_scale_factor_spatial, width // self.vae_scale_factor_spatial
569
+ shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
570
+
571
+ image = image.unsqueeze(2) # [B, C, 1, H, W]
572
+ if isinstance(generator, list):
573
+ image_latents = [
574
+ retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i], "argmax")
575
+ for i in range(batch_size)
576
+ ]
577
+ else:
578
+ image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator, "argmax") for img in image]
579
+
580
+ image_latents = torch.cat(image_latents, dim=0).to(dtype) * self.vae_scaling_factor
581
+ image_latents = image_latents.repeat(1, 1, num_latent_frames, 1, 1)
582
+
583
+ if latents is None:
584
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
585
+ else:
586
+ latents = latents.to(device=device, dtype=dtype)
587
+
588
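+ # Blend a small amount of the repeated image latents into the initial noise; t = 0.999 keeps the start essentially noise while weakly anchoring it to the image.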
+ t = torch.tensor([0.999]).to(device=device)
589
+ latents = latents * t + image_latents * (1 - t)
590
+
591
+ if image_condition_type == "token_replace":
592
+ image_latents = image_latents[:, :, :1]
593
+
594
+ return latents, image_latents
595
+
596
+ def enable_vae_slicing(self):
597
+ r"""
598
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
599
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
600
+ """
601
+ self.vae.enable_slicing()
602
+
603
+ def disable_vae_slicing(self):
604
+ r"""
605
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
606
+ computing decoding in one step.
607
+ """
608
+ self.vae.disable_slicing()
609
+
610
+ def enable_vae_tiling(self):
611
+ r"""
612
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
613
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and for
614
+ processing larger images.
615
+ """
616
+ self.vae.enable_tiling()
617
+
618
+ def disable_vae_tiling(self):
619
+ r"""
620
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
621
+ computing decoding in one step.
622
+ """
623
+ self.vae.disable_tiling()
624
+
625
+ @property
626
+ def guidance_scale(self):
627
+ return self._guidance_scale
628
+
629
+ @property
630
+ def num_timesteps(self):
631
+ return self._num_timesteps
632
+
633
+ @property
634
+ def attention_kwargs(self):
635
+ return self._attention_kwargs
636
+
637
+ @property
638
+ def current_timestep(self):
639
+ return self._current_timestep
640
+
641
+ @property
642
+ def interrupt(self):
643
+ return self._interrupt
644
+
645
+ @torch.no_grad()
646
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
647
+ def __call__(
648
+ self,
649
+ image: PIL.Image.Image,
650
+ prompt: Union[str, List[str]] = None,
651
+ prompt_2: Union[str, List[str]] = None,
652
+ negative_prompt: Union[str, List[str]] = None,
653
+ negative_prompt_2: Union[str, List[str]] = None,
654
+ height: int = 720,
655
+ width: int = 1280,
656
+ num_frames: int = 129,
657
+ num_inference_steps: int = 50,
658
+ sigmas: List[float] = None,
659
+ true_cfg_scale: float = 1.0,
660
+ guidance_scale: float = 1.0,
661
+ num_videos_per_prompt: Optional[int] = 1,
662
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
663
+ latents: Optional[torch.Tensor] = None,
664
+ prompt_embeds: Optional[torch.Tensor] = None,
665
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
666
+ prompt_attention_mask: Optional[torch.Tensor] = None,
667
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
668
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
669
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
670
+ output_type: Optional[str] = "pil",
671
+ return_dict: bool = True,
672
+ attention_kwargs: Optional[Dict[str, Any]] = None,
673
+ callback_on_step_end: Optional[
674
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
675
+ ] = None,
676
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
677
+ prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
678
+ max_sequence_length: int = 256,
679
+ image_embed_interleave: Optional[int] = None,
680
+ ):
681
+ r"""
682
+ The call function to the pipeline for generation.
683
+
684
+ Args:
685
+ prompt (`str` or `List[str]`, *optional*):
686
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
687
+ instead.
688
+ prompt_2 (`str` or `List[str]`, *optional*):
689
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
690
+ will be used instead.
691
+ negative_prompt (`str` or `List[str]`, *optional*):
692
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
693
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
694
+ not greater than `1`).
695
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
696
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
697
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
698
+ height (`int`, defaults to `720`):
699
+ The height in pixels of the generated image.
700
+ width (`int`, defaults to `1280`):
701
+ The width in pixels of the generated image.
702
+ num_frames (`int`, defaults to `129`):
703
+ The number of frames in the generated video.
704
+ num_inference_steps (`int`, defaults to `50`):
705
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
706
+ expense of slower inference.
707
+ sigmas (`List[float]`, *optional*):
708
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
709
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
710
+ will be used.
711
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
712
+ When > 1.0 and a `negative_prompt` is provided, enables true classifier-free guidance.
713
+ guidance_scale (`float`, defaults to `1.0`):
714
+ Guidance scale as defined in [Classifier-Free Diffusion
715
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
716
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
717
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
718
+ the text `prompt`, usually at the expense of lower image quality. Note that the only available
719
+ HunyuanVideo model is CFG-distilled, which means that traditional guidance between unconditional and
720
+ conditional latent is not applied.
721
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
722
+ The number of videos to generate per prompt.
723
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
724
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
725
+ generation deterministic.
726
+ latents (`torch.Tensor`, *optional*):
727
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
728
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
729
+ tensor is generated by sampling using the supplied random `generator`.
730
+ prompt_embeds (`torch.Tensor`, *optional*):
731
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
732
+ provided, text embeddings are generated from the `prompt` input argument.
733
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
734
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
735
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
736
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
737
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
738
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
739
+ argument.
740
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
741
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
742
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
743
+ input argument.
744
+ output_type (`str`, *optional*, defaults to `"pil"`):
745
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
746
+ return_dict (`bool`, *optional*, defaults to `True`):
747
+ Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a plain tuple.
748
+ attention_kwargs (`dict`, *optional*):
749
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
750
+ `self.processor` in
751
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
752
+ clip_skip (`int`, *optional*):
753
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
754
+ the output of the pre-final layer will be used for computing the prompt embeddings.
755
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
756
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
757
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
758
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
759
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
760
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
761
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
762
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
763
+ `._callback_tensor_inputs` attribute of your pipeline class.
764
+
765
+ Examples:
766
+
767
+ Returns:
768
+ [`~HunyuanVideoPipelineOutput`] or `tuple`:
769
+ If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned, otherwise a `tuple` is returned
770
+ where the first element is a list with the generated
771
+ video frames.
772
+ """
773
+
774
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
775
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
776
+
777
+ # 1. Check inputs. Raise error if not correct
778
+ self.check_inputs(
779
+ prompt,
780
+ prompt_2,
781
+ height,
782
+ width,
783
+ prompt_embeds,
784
+ callback_on_step_end_tensor_inputs,
785
+ prompt_template,
786
+ true_cfg_scale,
787
+ guidance_scale,
788
+ )
789
+
790
+ image_condition_type = self.transformer.config.image_condition_type
791
+ has_neg_prompt = negative_prompt is not None or (
792
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
793
+ )
794
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
795
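+ # Default image-embedding interleave stride depends on the conditioning type: 2 for "latent_concat", 4 for "token_replace", 1 otherwise.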
+ image_embed_interleave = (
796
+ image_embed_interleave
797
+ if image_embed_interleave is not None
798
+ else (
799
+ 2 if image_condition_type == "latent_concat" else 4 if image_condition_type == "token_replace" else 1
800
+ )
801
+ )
802
+
803
+ self._guidance_scale = guidance_scale
804
+ self._attention_kwargs = attention_kwargs
805
+ self._current_timestep = None
806
+ self._interrupt = False
807
+
808
+ device = self._execution_device
809
+
810
+ # 2. Define call parameters
811
+ if prompt is not None and isinstance(prompt, str):
812
+ batch_size = 1
813
+ elif prompt is not None and isinstance(prompt, list):
814
+ batch_size = len(prompt)
815
+ else:
816
+ batch_size = prompt_embeds.shape[0]
817
+
818
+ # 3. Prepare latent variables
819
+ vae_dtype = self.vae.dtype
820
+ image_tensor = self.video_processor.preprocess(image, height, width).to(device, vae_dtype)
821
+
822
+ if image_condition_type == "latent_concat":
823
+ num_channels_latents = (self.transformer.config.in_channels - 1) // 2
824
+ elif image_condition_type == "token_replace":
825
+ num_channels_latents = self.transformer.config.in_channels
826
+
827
+ latents, image_latents = self.prepare_latents(
828
+ image_tensor,
829
+ batch_size * num_videos_per_prompt,
830
+ num_channels_latents,
831
+ height,
832
+ width,
833
+ num_frames,
834
+ torch.float32,
835
+ device,
836
+ generator,
837
+ latents,
838
+ image_condition_type,
839
+ )
840
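+ # Keep the image conditioning only on the first latent frame; the single-channel mask marks which frames carry conditioning.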
+ if image_condition_type == "latent_concat":
841
+ image_latents[:, :, 1:] = 0
842
+ mask = image_latents.new_ones(image_latents.shape[0], 1, *image_latents.shape[2:])
843
+ mask[:, :, 1:] = 0
844
+
845
+ # 4. Encode input prompt
846
+ transformer_dtype = self.transformer.dtype
847
+ prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
848
+ image=image,
849
+ prompt=prompt,
850
+ prompt_2=prompt_2,
851
+ prompt_template=prompt_template,
852
+ num_videos_per_prompt=num_videos_per_prompt,
853
+ prompt_embeds=prompt_embeds,
854
+ pooled_prompt_embeds=pooled_prompt_embeds,
855
+ prompt_attention_mask=prompt_attention_mask,
856
+ device=device,
857
+ max_sequence_length=max_sequence_length,
858
+ image_embed_interleave=image_embed_interleave,
859
+ )
860
+ prompt_embeds = prompt_embeds.to(transformer_dtype)
861
+ prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
862
+ pooled_prompt_embeds = pooled_prompt_embeds.to(transformer_dtype)
863
+
864
+ if do_true_cfg:
865
+ black_image = PIL.Image.new("RGB", (width, height), 0)
866
+ negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask = self.encode_prompt(
867
+ image=black_image,
868
+ prompt=negative_prompt,
869
+ prompt_2=negative_prompt_2,
870
+ prompt_template=prompt_template,
871
+ num_videos_per_prompt=num_videos_per_prompt,
872
+ prompt_embeds=negative_prompt_embeds,
873
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
874
+ prompt_attention_mask=negative_prompt_attention_mask,
875
+ device=device,
876
+ max_sequence_length=max_sequence_length,
877
+ )
878
+ negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
879
+ negative_prompt_attention_mask = negative_prompt_attention_mask.to(transformer_dtype)
880
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.to(transformer_dtype)
881
+
882
+ # 5. Prepare timesteps
883
+ sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
884
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
885
+
886
+ # 6. Prepare guidance condition
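+ # Guidance-distilled checkpoints take the guidance scale as an embedded condition (scaled by 1000) rather than via a separate unconditional pass.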
887
+ guidance = None
888
+ if self.transformer.config.guidance_embeds:
889
+ guidance = (
890
+ torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
891
+ )
892
+
893
+ # 7. Denoising loop
894
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
895
+ self._num_timesteps = len(timesteps)
896
+
897
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
898
+ for i, t in enumerate(timesteps):
899
+ if self.interrupt:
900
+ continue
901
+
902
+ self._current_timestep = t
903
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
904
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
905
+
906
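+ # "latent_concat" concatenates image latents and a frame mask along the channel dim; "token_replace" swaps the encoded image in as the first latent frame.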
+ if image_condition_type == "latent_concat":
907
+ latent_model_input = torch.cat([latents, image_latents, mask], dim=1).to(transformer_dtype)
908
+ elif image_condition_type == "token_replace":
909
+ latent_model_input = torch.cat([image_latents, latents[:, :, 1:]], dim=2).to(transformer_dtype)
910
+
911
+ noise_pred = self.transformer(
912
+ hidden_states=latent_model_input,
913
+ timestep=timestep,
914
+ encoder_hidden_states=prompt_embeds,
915
+ encoder_attention_mask=prompt_attention_mask,
916
+ pooled_projections=pooled_prompt_embeds,
917
+ guidance=guidance,
918
+ attention_kwargs=attention_kwargs,
919
+ return_dict=False,
920
+ )[0]
921
+
922
+ if do_true_cfg:
923
+ neg_noise_pred = self.transformer(
924
+ hidden_states=latent_model_input,
925
+ timestep=timestep,
926
+ encoder_hidden_states=negative_prompt_embeds,
927
+ encoder_attention_mask=negative_prompt_attention_mask,
928
+ pooled_projections=negative_pooled_prompt_embeds,
929
+ guidance=guidance,
930
+ attention_kwargs=attention_kwargs,
931
+ return_dict=False,
932
+ )[0]
933
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
934
+
935
+ # compute the previous noisy sample x_t -> x_t-1
936
+ if image_condition_type == "latent_concat":
937
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
938
+ elif image_condition_type == "token_replace":
939
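+ # For token_replace, step only the frames after the clean first frame, then re-attach the image latent as frame 0.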
+ latents = self.scheduler.step(
940
+ noise_pred[:, :, 1:], t, latents[:, :, 1:], return_dict=False
941
+ )[0]
942
+ latents = torch.cat([image_latents, latents], dim=2)
943
+
944
+ if callback_on_step_end is not None:
945
+ callback_kwargs = {}
946
+ for k in callback_on_step_end_tensor_inputs:
947
+ callback_kwargs[k] = locals()[k]
948
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
949
+
950
+ latents = callback_outputs.pop("latents", latents)
951
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
952
+
953
+ # call the callback, if provided
954
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
955
+ progress_bar.update()
956
+
957
+ if XLA_AVAILABLE:
958
+ xm.mark_step()
959
+
960
+ self._current_timestep = None
961
+
962
+ if not output_type == "latent":
963
+ latents = latents.to(self.vae.dtype) / self.vae_scaling_factor
964
+ video = self.vae.decode(latents, return_dict=False)[0]
965
+ if image_condition_type == "latent_concat":
966
+ video = video[:, :, 4:, :, :]
967
+ video = self.video_processor.postprocess_video(video, output_type=output_type)
968
+ else:
969
+ if image_condition_type == "latent_concat":
970
+ video = latents[:, :, 1:, :, :]
971
+ else:
972
+ video = latents
973
+
974
+ # Offload all models
975
+ self.maybe_free_model_hooks()
976
+
977
+ if not return_dict:
978
+ return (video,)
979
+
980
+ return HunyuanVideoPipelineOutput(frames=video)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuandit/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.04 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuandit/__pycache__/pipeline_hunyuandit.cpython-310.pyc ADDED
Binary file (28.2 kB). View file